William Jiang

JavaScript,PHP,Node,Perl,LAMP Web Developer – http://williamjxj.com; https://github.com/williamjxj?tab=repositories

Auto Scraper

#! /usr/bin/perl -w

use lib qw(../lib/);
use kijiji_config;
use db;
use kijiji_us;

use warnings;
use strict;
use Data::Dumper;
use FileHandle;
use WWW::Mechanize;
use DBI;
use Getopt::Long;

local ($|) = 1;
undef $/;

our ( $num, $start_time, $end_time, $end_date ) = ( 0, 0, 0, 0 );
our ( $start_url, $todate ) = ( undef, INTERVAL_DATE );
our ( $mech, $db, $kijiji, $log ) = ( undef, undef );
our ( $dbh, $sth );

$start_time = time;

my ( $host, $user, $pass, $dsn ) = ( HOST, USER, PASS, DSN );
$dsn .= ":hostname=$host";
$db = new db( $user, $pass, $dsn );
$dbh = $db->{dbh};

$kijiji = new kijiji_us( $db->{dbh} ) or die;

$log = $kijiji->get_filename(__FILE__);
$kijiji->set_log($log);
$kijiji->write_log( "[" . $log . "]: start at: [" . localtime() . "]." );

my ( $city, $item ) = ( 'Los Angeles', 'Graphic & web design' );
my ( $jobs, $first, $help, $version, $list ) = ( 'jobs', undef, undef, undef );

usage()
  unless (
	GetOptions(
		'jobs=s'   => \$jobs,
		'first'    => \$first,
		'list'     => \$list,
		'todate=s' => \$todate,
		'city=s'   => \$city,
		'item=s'   => \$item,
		'help|?'   => \$help,
		'version'  => \$version
	)
  );

$help && usage();

if ($first) {
	my $ca1 = $kijiji->select_cities();
	foreach my $ca2 (@$ca1) {
		print $ca2->[0] . "\n";
	}
	exit 1;
}

my $db_name;
if ( $jobs eq 'jobs' ) {
	$db_name = USJOBS;
}
elsif ( $jobs eq 'services' ) {
	$db_name = USSERVICES;
}
else {
	die "There is no suitable job selected.";
}

if ($list) {
	my $list1 = $kijiji->select_items($jobs);
	foreach my $list2 (@$list1) {
		print $list2->[0] . "\n";
	}
	exit 2;
}
if ($version) {
	print <get_us_end_date( $todate );
}
if ( $city && $item ) {
	my ( $r1, $r2 ) = ( '', '' );

	$r1 = $kijiji->select_city($city);
	die "No such city: , $0 quit." unless ($r1);

	$r2 = $kijiji->select_item( $item, $jobs );
	die "No such category: , $0 quit." unless ($r2);

	$start_url = $r1 . $r2 if ( $city && $item );
	$kijiji->write_log( "URL: ." );
}

################################################

$mech = WWW::Mechanize->new( autocheck => 0 );

LOOP:
$mech->get($start_url);
$mech->success or die $mech->response->status_line;
my $html = $kijiji->trunc_html( $mech->content );

my $ary = $kijiji->next_page( $mech->content );
$kijiji->write_log( $ary, $jobs.','.$city.','.$item.'_PAGE' );
# print Dumper($ary);
if ($html =~ m/0 results found in/i) {
	$kijiji->write_log( 'no records found for ' . $jobs.','.$city.','.$item.'.' );
	goto LOOP1;
}

my $aoa = $kijiji->parse_html($html);

my ( $date, $address, $type, $email, $phone, $web, $content, $dateflag );
my ($url, $key, $detail);
# select date_format(str_to_date('10-oct-2006', "%d-%b-%Y"), "%d-%m-%Y");
# my $location = $dbh->quote( $a->[2] );
foreach my $a ( @{$aoa} ) {

	$url = $dbh->quote( $a->[0] );
	$key = $dbh->quote( $a->[1] );

	$num++;
	$mech->follow_link( url => $a->[0] );
	$mech->success or next;

	$detail =
	  $kijiji->parse_sub_html( $kijiji->trunc_sub_html( $mech->content ) );

	$date    = $kijiji->get_mysql_date( $detail->[0], US_DATEFORMAT );
	# william adds in July 14, 2010.
	next unless $date;

	my $dateflag    = $kijiji->compare_end_date($detail->[0], $end_date);
	if ($dateflag disconnect();
		$end_time = time;
		$kijiji->write_log("$todate dates data: total [ " . ($end_time -
			$start_time) . " ] seconds used.\n");
		$kijiji->write_log( "There are total [ $num ] records was
				processed succesfully!\n");
		$kijiji->close_log();
		exit 6;
	}

	$address = $dbh->quote( $detail->[1] );
	$type    = $detail->[2];
	$email   = $dbh->quote( $detail->[3] );
	$phone   = $dbh->quote( $detail->[4] );
	$web     = $dbh->quote( $detail->[5] );
	$content = $dbh->quote( $detail->[6] );

	$kijiji->write_log( "No: " 
		  . $num . " -- [" 
		  . $url . ", " 
		  . $key . ", " 
		  . $city . ", "
		  . $item . ", "
		  . $jobs . ", "
		  . $address . ", "
		  . $type . ", "
		  . $date . ", "
		  . $email . ", "
		  . $phone . ", "
		  . $web
		  . ", \n" );

	$sth = $dbh->do(
		qq{ insert ignore into } . $db_name . qq{
		(url,keywords,summary,city,item,category,
		address,type,post_date,create_date,email,phone,web)
	values($url,$key,$content,'$city', '$item',
		'$jobs', $address, '$type', '$date', now(), $email, $phone, $web) }
	);

	$mech->back();
}

LOOP1:
if ( $ary->[1] && $ary->[2] ) {
	$start_url = $ary->[1];
	goto LOOP;
}

$dbh->disconnect();

$end_time = time;
$kijiji->write_log(
	"There are total [ $num ] records was processed succesfully!\n");
$kijiji->write_log( "$todate, Finally, there are total [ "
	  . ( $end_time - $start_time )
	  . " ] seconds used.\n" );
$kijiji->write_log("----------------------------------------------\n");
$kijiji->close_log();

exit 8;


########################################################

sub usage {
	print <<HELP;
Uage:
      $0
     or:
      $0 -c city -i category
     or:
      $0 -t 3
     or:
      $0 -h  [-v]
Description:
  -t from what date to download? default it's from 2 days before.
  -c city, which city to scrape?
  -i category/item, which category/item to scrape?
  -h this help
  -v version

Examples:
     (1) $0     # use default
     (2) $0 -d  # use default 
     (3) $0 -c vancouver -i services       # scrape vancouver's gigs
     (4) $0 -c calgory -i 'services'
     (5) $0 -h  # get help
     (6) $0 -v  # get version

HELP
	exit 3;
}

###############################
package kijiji;

use lib qw(../lib/);
use kijiji_config;
use common;
@ISA = qw(common);
use strict;
our ($sth);

sub new
{
	my ($type, $dbh_handle) = @_;
	my $self = {};
	$self->{app} = 'kijiji';
	$self->{dbh} = $dbh_handle;
	bless $self, $type;
}
sub trunc_html
{
	my ($self, $html) = @_;
	$html =~ m {
		<table\sid="SNB_Results"
		(.*?)
		</table>
	}sgix;
	return $1;
}

# (?:.*?)				# 1. Wanted
sub parse_3td
{
	my ($self, $html) = @_;
	$html =~ m{
		<a>
		(.*?)				# 3. keywords.
		</a>
		<br />
		(.*)				# 4. summary describe
	}sgix;
	my ($url,$key,$summary) = ($1,$2,$3);
	$summary = $self->trim($summary);
	return [$url,$key,$summary];
}

sub parse_html
{
	my ($self, $html) = @_;
	my $aoh = [];

	while ($html =~ m {
		<tr\sclass="resultsTableSB
		(?:.*?)
		<td>
		(?:.*?)
		<td>
		(.*?)	# 1. the 3rd is useful.
		</td>
		(?:.*?)
		<td>
		(.*?)				# 2. date.
		</td>
		(?:.*?)
		</tr>
	}sgix) {
		my ($t1,$t2) = ($1,$2);
		my $url_key_summary = $self->parse_3td($t1);
		my $date = $self->trim($t2);
		$date =~ s/</[0],$url_key_summary->[1],$url_key_summary->[2],$date]);
	}
	return $aoh;
}

sub next_page
{
	my ($self, $html) = @_;
	my $aoh = [];
	$html =~ m {
		<div>
		\s+(\d+)\s+					# current page
		</div>
		(?:.*?)
		<div>
		(?:.*?)
		<a>			# url
		(\d+)						# next page
		</a>
	}sgix;
	my ($current, $url, $next) = ($1, $2, $3);

	return [$current, $url, $next];
}

sub get_item
{
	my ($self, $html) = @_;
	$html =~ m {
		>
		(?:.*?)
		>
		(?:.*?)>
		(.*?)
		</a>
		(?:.*?)
		>\s+Ad\s+ID
	}sgix;
	return $self->trim($1);
}
sub trunc_sub_html
{
	my ($self, $html) = @_;
	$html =~ m {
		<td>
		(.*?)
		
	}sgix;
	return $1;
}

# (?:.*?)
# Visits:\s+(\d+)						# 4. Visits.
sub parse_sub_html
{
	my ($self, $html) = @_;
	my $aref = [];

	$html =~ m {
		Date\sListed
		(?:.*?)
		<td>
		(.*?)			# 1. Date
		</td>
		(?:.*?)
		<td>
		(?:Address|Location)
		(?:.*?)
		<td>
		(.*?)			# 2. Address
		(?:<br|</td>)
		(?:.*?)
		<!--\sgoogle_ad_section_start\s-->
		(.*)								# 3. Content
		<!--\sgoogle_ad_section_end\s-->
	}sgix;

	my ($date,$address,$content) = ($1,$2,$3);
	$date = $self->trim($date);
	$address = $self->trim($address);

	# print $html unless (defined $content);
	$self->write_log($html) unless (defined $content);
	my $email = $self->get_email($content);
	my $phone = $self->get_phone($content);
	my $web = $self->get_web($content);

	$aref = [$date,$address,$email,$phone,$web];

	return $aref;
}

sub select_city
{
	my ($self, $city) = @_;
	my @row = ();
	$sth = $self->{dbh}->prepare(q{ select curl from }.CITY.qq{ where cname='$city' and area2='ca' });
	$sth->execute();
	@row = $sth->fetchrow_array();
	$sth->finish();
	return $row[0];
}
sub select_item {
	my ( $self, $item, $category ) = @_;
	my @row = ();
	$sth =
	  $self->{dbh}->prepare( qq{ select iurl from } 
		  . ITEM
		  . qq{ where iname='$item' and category='$category' } );
	$sth->execute();
	@row = $sth->fetchrow_array();
	$sth->finish();
	return $row[0];
}
sub select_category_item
{
	my ($self, $item) = @_;
	my @row = ();
	$sth = $self->{dbh}->prepare(qq{ select curl from }.CATEGORY.{ where cname='$item' });
	$sth->execute();
	@row = $sth->fetchrow_array();
	$sth->finish();
	if ($row[0]) {
		return $row[0];
	}
	else {
		$sth = $self->{dbh}->prepare(qq{ select iurl from }.ITEM.qq{ where iname='$item' });
		$sth->execute();
		@row = $sth->fetchrow_array();
		$sth->finish();
		return $row[0];
	}
}
# http://vancouver.kijiji.ca/f-services-W0QQCatIdZ72
sub select_category_url
{
	my ($self, $name) = @_;
	my @row = ();
	my $sth = $self->{dbh}->prepare(qq{ select curl from }.CATEGORY.qq{ where cname = '$name' });
	$sth->execute();
	@row = $sth->fetchrow_array();
	$sth->finish();
	return $row[0];
}

# Loop all canada cities, with $kijiji_scraper.pl -f (first) option.
sub select_ca_cities
{
	my $self = shift;
	my $aref = [];
	# $sth = $self->{dbh}->prepare(q{select cname from } . CITY . qq{ where area2='ca'});
	$sth = $self->{dbh}->prepare(q{select cname from } . CITY . qq{ where area1  'Québec' and area2='ca'});
	$sth->execute();
	$aref = $sth->fetchall_arrayref();
	$sth->finish();
	return $aref;
}

sub select_keywords_email
{
	my ($self, $k, $e) = @_;
	my $sql = "select * from ".TOPIC. " where keywords like '%".$k."%' and email like '%".$e."%'";
	$self->show_results($sql);
}

sub select_keywords
{
	my ($self, $k) = @_;
	my $sql = "select * from " . TOPIC . " where keywords like '%".$k."%'";
	$self->show_results($sql);
}

sub select_email
{
	my ($self, $e) = @_;
	my $sql = "select * from " . TOPIC . " where email like '%".$e."%'";
	$self->show_results($sql);
}

sub get_mysql_date
{
	my ($self, $date) = @_;
	$date = $self->{dbh}->quote($date);
	my $sth=$self->{dbh}->prepare(qq{ select str_to_date($date, "%d-%b-%y") });
	$sth->execute();
	my @row = $sth->fetchrow_array();
	$sth->finish();
	return $row[0];
}

# 25-Mar-10
sub get_end_date
{
	my ($self,$todate) = @_;

	my $sth = $self->{dbh}->prepare(qq{ select date_format(date_sub(now(), interval }. $todate.  qq{ day), '%d-%b-%y') });
	$sth->execute();
	my @row = $sth->fetchrow_array();
	$sth->finish();
	return $row[0];
}

1;

3 responses to “Auto Scraper

  1. Jason Wettstein 09/13/2011 at 3:54 pm

    You do very good work William

  2. powercashadvance.com 10/26/2011 at 4:37 pm

    What would we all do minus the magnificent thoughts you talk about on this web site? Who has the persistence to deal with important topics with regard to common readers like me? My spouse and i and my girlfriends are very lucky to have your web site among the kinds we typically visit. Hopefully you know how considerably we enjoy your working hard! Best wishes through us all.

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

Follow

Get every new post delivered to your Inbox.

Join 75 other followers

%d bloggers like this: