#!/bin/sh -- # A comment mentioning perl
eval 'exec perl -S $0 ${1+"$@"}'
    if 0;

require 5.005;

############################################################################
# Configuration section

# Space separated list of email addresses to notify:
#$Who = 'runge@karlrunge.com aragon@dcai.com';
$Who = 'runge@karlrunge.com';

# List of URL's to watch:
undef(@URLS);
@URLS = qw(
    http://listings.ebay.com/aw/plistings/newtoday/all/category239/index.html
    http://listings.ebay.com/aw/plistings/newtoday/all/category329/index.html
    http://listings.ebay.com/aw/plistings/newtoday/all/category332/index.html
    http://listings.ebay.com/aw/plistings/newtoday/all/category336/index.html
    http://listings.ebay.com/aw/plistings/newtoday/all/category2399/index.html
    http://listings.ebay.com/aw/plistings/newtoday/all/category2404/index.html
);

# These are the regex patterns to look for:
undef(@Patterns);
$i = 0;
#
#$Patterns[$i++] = qr/(?i)sasha/;
# this one matches multiple occurrences between HTML tags:
$Patterns[$i++] = qr/(?i)sasha([^<]*sasha)*/;
$Patterns[$i++] = qr/(?i)\bgregor\b/;
#
# Notes: the (?i) means case-insensitive match.
#        the qr/.../ are perl 5.005 quoted regexes.
#        the \bgregor\b avoids matches to, e.g., "gregory"
#

# Command to run whenever there is a match.  Output of this command is
# included in the email message.  In the command string $Found_Cmd, the
# token %URL is expanded to the url, %HREF to the corresponding href
# (if any), %MATCH to what matched.  Currently, the same command string
# is used for all matches.  If the href is empty and $Found_Cmd contains
# %HREF, then the command is not run.
#
# Since urls often contain "&" and other shell meta characters, it is
# important to use single quotes in your command for %URL and %HREF.
# Before insertion into your command, any single quotes in the url, found
# href, and match will be replaced with the encoded: %27
#
#$Found_Cmd = "lynx -dump '%HREF' | grep -i 'buy.*it.*now.*for'";

#
# If non-empty, $Follow_Match is a regex for additional URL's to follow.
# That is, any hyperlinks found in the html of @URLS above that match
# this pattern will be checked as well.  This allows the possibility
# of following "Next ..." links without explicitly listing them in the
# @URLS array.
#
# For simplicity, only a single level of recursion is done (to avoid
# loops, duplications etc).  The resulting urls' html is *not* similarly
# searched and followed.
#
$Follow_Match = qr/\/page\d+\.html/;

# If non-zero, loop continuously sleeping $Loop seconds between checks.
$Loop = 0;

#
# To avoid hitting the server too hard, sleep this many seconds between
# web page fetches.
#
$Wait = 1;

# If 1, attempt to trim out all HTML tags before checking for pattern match.
$Trim_Tags = 0;

#
# Path to file with code resembling this Configuration section, to source
# (via "require") before each iteration.  If you happen to modify the file
# while this script is running, your changes will be picked up.
#
$Config_File = '';

#
# $Hit_File is the path to a file with "hits" that we have already found and
# do not want to repeat.  It is read in at startup time.  Whenever a new "hit"
# is found it is appended to the file.  The format is 1 URL per line.
#
# Note that a "hit" is not a url from @URLS + a match from @Patterns.
# It is that plus a found HREF="..." to the immediate left of the match.
# Simply the url in the nearby HREF="..." labels the hit (i.e. no
# information about the original URL and the match is currently used).
#
# For example, suppose we are watching for the string "money" in a URL.
# There may be many over time, and we want to be notified of any new ones like:
#   <A HREF="http://foo.com/123/index.html">Free money for you NOW!</A>.
# We do not want to be notified each time this "http://foo.com/123/index.html"
# "money" link is found when the script is rerun, only the first time.
#
# The drawback here is that if no HREF is found, there will be repeat
# notifications.
#
$Hit_File = '';

#
# For each pattern, an HTML file is processed for matches from beginning
# to end.  When there is a match, a <A HREF="..."> is searched for to the
# immediate left of the match (see above).  The HTML is then trimmed from
# the location of the match all the way to left and the search process
# repeated.  This trimming means it is possible for a match to not have
# a corresponding HREF (e.g. a pattern matched twice in one hyperlink
# label).  Set $Must_Have_HREF = 1 to NOT report any matches that do not
# have a corresponding HREF.  This is also the -r option.
#
$Must_Have_HREF = 0;

# Mail server (not currently used, may be needed for Win* later...)
# $Mail_Server = 'smtp.myisp.com';

# Path to sendmail (for mailing on Unix)
$Sendmail = '/usr/lib/sendmail';

#
# Timeout for url fetching.  This is Unix specific, and also only applies
# if lynx(1) is being used in the sub fetch_url (see code below).
# If set to zero, the lynx command is run directly to get the HTML.
# If set greater than zero: a fork() is done, the child runs lynx(),
# and an alarm() is set for $Timeout seconds.  If the child has not
# finished in that time (SIGALRM is trapped), the child is terminated
# and the fetch_url returns the empty string as the HTML.
#
$Timeout = 30;

$Debug = 1;

1;  # remember this if using -c $Config_File (to make require happy)

############################################################################

$Usage = <<"END";

url_watch: watch for the appearance of strings in web pages and
           notify via email.

Usage: url_watch <options>

Options:

  -w '<list>'   List of email addresses to notify. (space separated)

  -u <url>      Add <url> to the urls to watch.

  -p <pattern>  Add <pattern> to the patterns to watch for.

  -f <pattern>  If embedded URLs match <pattern>, those URLs are
                searched as well.

  -l <n>        Loop continously, waiting <n> seconds between checks.

  -s <n>        Sleep <n> seconds between URL fetches.

  -t            Trim HTML tags before checking for pattern matches.

  -H <file>     Hit file used to avoid repeat matches. It is read at
                startup. A hit is a match *and* a found HREF="..." to
                the left of the match. The HREF url defines the hit,
                not the match itself. <file> has one URL per line.

  -r            Sets Must_Have_HREF. If a match has no corresponding
                HREF then do not report it. Each HTML file is processed
                and trimmed from left to right, so it is possible for a
                match to not have an HREF to its left.

  -c <file>     Configuration file (containing code like that at the
                top of this script) is sourced before each check
                iteration. This allows resetting the parameters, URLs,
                etc without having to restart the program. File should
                end with "1;"

  -m <path>     Specify the path to the Unix sendmail program.
                (Default: $Sendmail)

  -T <n>        Set the timeout for url fetching. See the comments in
                the Configuration section for details.

  -C '<cmd>'    Command to run whenever a match is found. See the
                comments in the Configuration section for details.

  -d            Print debugging output to stderr.

  -h, -help     Print this message and exit.

Notes:

  Use of -c may override parameters supplied on the cmd line.

END

############################################################################
# Process cmd line arguments:

LOOP:
while (@ARGV) {
    $_ = shift;
    CASE: {
        /^-w$/ && ($Who = shift, last CASE);
        /^-u$/ && ($tmp = shift, push(@urls, $tmp), last CASE);
        /^-p$/ && ($tmp = shift, push(@pats, $tmp), last CASE);
        /^-f$/ && ($Follow_Match = shift, last CASE);
        /^-l$/ && ($Loop = shift, last CASE);
        /^-s$/ && ($Wait = shift, last CASE);
        /^-t$/ && ($Trim_Tags = 1, last CASE);
        /^-r$/ && ($Must_Have_HREF = 1, last CASE);
        /^-c$/ && ($Config_File = shift, last CASE);
        /^-H$/ && ($Hit_File = shift, last CASE);
        /^-m$/ && ($Sendmail = shift, last CASE);
        /^-d$/ && ($Debug = 1, last CASE);
        /^-T$/ && ($Timeout = shift, last CASE);
        /^-C$/ && ($Found_Cmd = shift, last CASE);

        /^--$/ && (last LOOP);      # -- means end of switches
        # GNU style --opt is treated the same as -opt:
        /^-(-.*)$/ && (unshift(@ARGV, $1), last CASE);

        /^(-h|-help)$/ && ((print STDERR $Usage), exit 0, last CASE);
        if ( /^-(..+)$/ ) {
            # split bundled switches, e.g. -tr becomes -t -r:
            my $bundle = $1;
            foreach my $letter (reverse(split(//, $bundle))) {
                unshift(@ARGV, "-$letter");
            }
            last CASE;
        }
        /^-/ && ((print STDERR "Invalid arg: $_\n$Usage"), exit 1, last CASE);

        # First non-switch argument: put it back and stop parsing.
        unshift(@ARGV, $_);
        last LOOP;
    }
}

use File::Basename;

# Unbuffer both output streams so debug output is not delayed:
select(STDERR); $| = 1;
select(STDOUT); $| = 1;

# Check for existence of needed files:
if ( $Config_File ne '' && ! -f $Config_File ) {
    die "No config file: $Config_File, $!";
}
if ( $Sendmail ne '' && ! -x $Sendmail ) {
    die "Cannot find sendmail: $Sendmail, $!";
}
if ( $Hit_File ne '' && ! -f $Hit_File ) {
    print STDERR "Warning: no hit file: $Hit_File, $!\n";
    print STDERR "(will try to create it if necessary)\n";
}

#
# Read in the previous hits from $Hit_File.  It is OK if $Hit_File doesn't
# exist yet: we will create it on the first write to it in check_url().
#
if ( $Hit_File ne '' && open(HITS, "<$Hit_File") ) {
    while (<HITS>) {
        chomp;
        $Already_Seen{$_} = 1;
    }
    close(HITS);
}

# Use any lists supplied via the cmd line:
@URLS     = @urls if @urls;
@Patterns = @pats if @pats;

# Do the checking:
while (1) {
    check_them();
    last if ! $Loop;
    print STDERR "sleeping $Loop ...\n";
    sleep($Loop);
}

exit 0;

############################################################################

#
# Top level subroutine to check all of the urls.
#
# Takes no arguments.  Re-reads $Config_File (if set), checks every url in
# @URLS plus any first-level links matching $Follow_Match, and mails the
# collected matches to $Who via notify() when there are any.
#
sub check_them {

    @Found = ();    # Matches will be placed in this array, and then
                    # emailed all at once via notify() below.

    if ( $Config_File ne '' ) {
        # Source the config file, it may have been edited since we
        # last read it in.  require() remembers loaded files in %INC
        # and would silently skip the file on every later iteration,
        # so forget it first.  A bare relative name is anchored to the
        # current directory because newer perls no longer search "."
        # via @INC.
        my $config = $Config_File;
        $config = "./$config" unless $config =~ m,^(?:\.\.?)?/,;
        delete $INC{$config};
        require $config;
    }

    foreach my $url (@URLS) {
        next if $url =~ /^#http:/;  # allow "commented out" entries in qw()

        my $html = check_url($url);

        if ( $Follow_Match ne '' ) {
            # now we follow matches in the toplevel HTML:
            foreach my $url2 ( follow($url, $html) ) {
                check_url($url2);
            }
        }
    }

    if ( @Found ) {
        # we found some matches, so send off a message:
        notify($Who, @Found);
    }
}

#
# Get and check the HTML of a URL for matches to @Patterns.
# Returns the html retrieved corresponding to $url and stores any
# matches in the global @Found array.
#
# Side effects: sleeps $Wait seconds after the fetch, updates
# %Already_Seen and $Hit_File for every new href, and runs $Found_Cmd
# for every reported match.
#
sub check_url {
    my ($url) = @_;

    my $html = fetch_url($url);

    sleep $Wait if $Wait;   # Sleep a bit to not hit server too hard.

    my $src = $html;
    if ( $Trim_Tags ) {
        $src = remove_html_comments($src);
        $src = remove_html_tags($src);
    }

    # First, collect which patterns matched at all:
    my @matches = ();
    foreach my $patt (@Patterns) {
        push(@matches, $patt) if $src =~ /$patt/;
    }

    #
    # Next, try to extract useful info from the html source i.e. a
    # corresponding HREF="URL" *before* the match and the link
    # name itself:
    #
    my $now = scalar(localtime);

    foreach my $patt (@matches) {
        my $str = $html;
        my $out = "$url\n";     # $out is output for part of the email message.

        while ( $str =~ /$patt/ ) {
            my ($before, $match, $after) = ($`, $&, $');

            # A pattern that matches the empty string never shrinks
            # $str, so give up on it instead of looping forever.
            last if $match eq '';

            $str = $after;  # Set $str to remainder of string.  This shrinks
                            # it so ultimately there will be no match.

            my $msg = " $match\t$now\n";

            #
            # Try to find any HREF to the *left* of match,
            # i.e. it is <A HREF="http://..."> blah MATCH blah </A>
            #                     ^^^^^^^^^^         ^^^^^
            # sets href to "http://..."
            # sets title to " blah MATCH blah "
            #
            my $href  = '';
            my $title = '';
            if ( $before =~ /<\s*A[^>]+HREF\s*=\s*([^>\s]+)[^>]*>([^<]*)$/i ) {
                #           < A      HREF  =    (   $1    )     >(  $2  )
                my ($raw_href, $label) = ($1, $2);
                $title = $label . $match;
                $href  = href2url($raw_href, $url);
                # the rest of the link label, up to the closing tag:
                if ( $after =~ /^([^<]+)<\/A>/i ) {
                    $title .= $1;
                }
                # complete the message info:
                $msg .= " $title\n";
                $msg .= " $href\n";
            }

            if ( $href eq '' && $Must_Have_HREF ) {
                # must have href ...
                print STDERR " no href found: ($match)\n" if $Debug;
                next;
            }
            elsif ( $href ne '' && $Already_Seen{$href} ) {
                # we skip any url's previously seen:
                print STDERR " already saw: $href ($match)\n" if $Debug;
                next;
            }
            else {
                print STDERR " found: $href ($match)\n" if $Debug;
                _record_hit($href) if $href ne '';
            }

            # The command is skipped when it needs an href we do not have.
            if ( $Found_Cmd ne ''
                 && ( $href ne '' || $Found_Cmd !~ /%HREF/ ) ) {
                $msg .= _run_found_cmd($url, $href, $match);
            }

            # append it to the found list.
            push(@Found, $out . $msg . "\n");
        }
    }
    return $html;
}

#
# Remember $href as seen and append it to $Hit_File (when one is set).
#
sub _record_hit {
    my ($href) = @_;

    $Already_Seen{$href}++;
    return if $Hit_File eq '';

    # append to the "hit" file:
    open(HITS, ">>$Hit_File")
        || die "Cannot append to hit file $Hit_File: $!";
    print HITS $href, "\n";
    close(HITS);
}

#
# Expand %URL, %HREF and %MATCH in $Found_Cmd, run the command through
# the shell and return its output formatted for the email message (one
# " - " prefixed line per output line).
#
# NOTE: $href and $match come from fetched web pages.  Single quotes are
# encoded as %27 so the values cannot escape a single-quoted shell word,
# which is why $Found_Cmd must single-quote its tokens.
#
sub _run_found_cmd {
    my ($url, $href, $match) = @_;

    my %value = ( 'URL' => $url, 'HREF' => $href, 'MATCH' => $match );
    foreach my $token (keys %value) {
        $value{$token} =~ s/'/%27/g;
    }

    # Expand all tokens in one pass so that text inserted for one token
    # is never re-expanded as another.
    my $cmd = $Found_Cmd;
    $cmd =~ s/%(URL|HREF|MATCH)/$value{$1}/g;

    my $cmdout = `$cmd`;

    my $text = '';
    foreach my $line (split(/\n/, $cmdout)) {
        $text .= " - $line\n";
    }
    return $text;
}

#
# Extract URLs from $html.  Return a list of any that match $Follow_Match.
#
# follow($url0, $html): scan $html for <A ... HREF=...> links, resolve
# each one relative to $url0, and return those matching $Follow_Match,
# each at most once and in order of first appearance.
#
sub follow {
    my ($url0, $html) = @_;
    my (@list, %saw);

    while ( $html =~ /<\s*A[^>]+HREF\s*=\s*([^>\s]+)/gi ) {
        my $url = href2url($1, $url0);

        # No /o here: $Follow_Match may change when the config file
        # is re-read between iterations.
        next unless $url =~ /$Follow_Match/;
        next if $saw{$url}++;
        push(@list, $url);
    }
    return @list;
}

#
# Retrieve a URL via LWP::Simple::get or lynx(1).  Returns the HTML.
#
# With $Timeout <= 0 lynx is run directly and may block.  Otherwise lynx
# runs in a forked child guarded by alarm($Timeout); on timeout the child
# is killed and the empty string is returned.  An empty string is also
# returned when the fork fails.
#
sub fetch_url {
    my ($url) = @_;

    # LWP module is the best way to go.  But is it ever in core perl?? Grrr.
    my $have_LWP = 0;
    my $html = '';

    print STDERR "fetch_url: $url\n" if $Debug;

    if ( $have_LWP ) {
        # the "use" below can only be uncommented when LWP module
        # exists, otherwise leads to compile-time error.
        #
        # use LWP::Simple;
        $html = get($url);
        return $html;
    }

    # the shell hacker's friend: lynx(1)
    # This is UNIX specific.  Esp. the SIGALRM/alarm()
    my $timeout = $Timeout;

    if ( $timeout <= 0 ) {
        # Old, simple way, but could block.
        # Also, protect single quotes just to be sure.
        $url =~ s/'/%27/g;
        $html = `lynx -source '$url' 2>&1`;
        return $html;
    }

    # Fork with the child's stdout connected to the HTML handle.
    my $pid = open(HTML, "-|");
    if ( ! defined($pid) ) {
        print STDERR "fetch_url: problem forking. $!\n";
        return '';
    }
    elsif ( ! $pid ) {
        # child: list form of exec, so no shell sees the url.
        exec 'lynx', '-source', $url;
        exit 1;
    }

    my $timed_out = 0;
    $SIG{ALRM} = sub {
        $SIG{ALRM} = 'DEFAULT';
        print STDERR "fetch_url: timed out. killing $pid\n";
        kill 'INT', $pid;
        kill 'KILL', $pid;
        $timed_out = 1;
    };
    alarm($timeout);

    while ( defined(my $line = <HTML>) ) {
        last if $timed_out;
        $html .= $line;
    }

    alarm(0);
    $SIG{ALRM} = 'DEFAULT';
    close(HTML);

    # A killed child leaves at best a truncated page; report nothing.
    return '' if $timed_out;
    return $html;
}

#
# Given an HREF $href extracted from the HTML corresponding to URL
# $initial_url, return the full URL that HREF points to.
#
#   proto://...   returned unchanged
#   //host/path   given the protocol of $initial_url
#   /path         rooted at the host of $initial_url
#   anything else resolved against the directory of $initial_url
#
sub href2url {
    my ($href, $initial_url) = @_;

    # first clean up quotes (either kind) and backslashes on the ends:
    $href =~ s/^["'\\]+//;
    $href =~ s/["'\\]+$//;

    # split off the protocol, then reduce the rest to the host part:
    my $host  = $initial_url;
    my $proto = "http";
    $proto = $1 if $host =~ s,^(\w+)://,,;
    $host =~ s,[/?#].*$,,;

    if ( $href =~ m,^\w+://, ) {
        # it is a full url with proto://
        return $href;
    }
    if ( $href =~ m,^//, ) {
        # protocol-relative: //host/path
        return "$proto:$href";
    }
    if ( $href =~ m,^/, ) {
        # it is path rooted at /
        return "$proto://$host$href";
    }

    # It is a relative path.  Work out the directory of the source
    # url, ignoring any query string or fragment.
    my $base = $initial_url;
    $base =~ s/[?#].*$//;
    if ( $base eq "$proto://$host" ) {
        # add missing / for this case
        $base .= '/';
    }
    elsif ( $base !~ m,/$, ) {
        # TODO this is approximate since:
        #       foo could -> foo/
        # after retrieving (say from a redirect).
        #
        # E.g. $initial_url=http://foo.com/somedir rather than
        #      $initial_url=http://foo.com/somedir/
        #
        # The last path component is assumed to be a file and is
        # dropped, keeping the trailing slash.
        $base =~ s,/[^/]*$,/,;
    }
    return "$base$href";
}

#
# Remove HTML comments from $html and return the result.
#
sub remove_html_comments {
    my ($html) = @_;

    # The simple form below fails for nested comments:
    #   $html =~ s/<!--.*?-->/ /g;
    return remove_nested_html_comments($html);
}

#
# Remove HTML comments, including nested ones, and return the result.
# (btw, are such things even allowed?)
#
# It will fail noticing quoted <!-- or -->, but those should be very
# rare.  An opener that is never closed loses only its marker; the text
# after it is kept.  A closer without an opener is left as it is.
#
sub remove_nested_html_comments {
    my ($html) = @_;

    # split() with a capturing group returns text and markers
    # alternately: even indexes are text, odd indexes are markers.
    my @pieces = split(/(<\s*!--|--\s*>)/, $html);

    # $open[0] collects text outside any comment; each further element
    # collects the text of one currently open comment.
    my @open = ('');
    for ( my $n = 0; $n < @pieces; $n++ ) {
        my $piece = $pieces[$n];
        if ( $n % 2 == 0 ) {
            $open[-1] .= $piece;
        }
        elsif ( $piece =~ /^</ ) {
            push(@open, '');
        }
        elsif ( @open > 1 ) {
            pop(@open);             # comment complete: drop its text
        }
        else {
            $open[0] .= $piece;     # closer without an opener
        }
    }
    return join('', @open);
}

#
# Replace every HTML tag in $html with a single space and return the
# result.  Fails to notice quoted < and > (should be rare).
#
sub remove_html_tags {
    my ($html) = @_;

    # [^>] also matches newlines, so tags spanning lines are handled.
    $html =~ s/<[^>]*>/ /g;
    return $html;
}

#
# Send an email notification.
#
#   $who  space separated list of addresses to mail to
#   @msg  the message body pieces
#
# Dies if the pipe to sendmail cannot be opened.
#
sub notify {
    my ($who, @msg) = @_;

    print STDERR "notify: who=$who\n", @msg, "\n" if $Debug;

    # Mail headers want a comma separated address list.
    my $to = join(', ', split(' ', $who));

    # open a pipe to sendmail(8).  This is UNIX specific.
    open(MAIL, "|$Sendmail -t") || die "Cannot run $Sendmail: $!";
    print MAIL "To: $to\n";
    print MAIL "Subject: url_watch\n\n";
    print MAIL @msg, "\n";
    close(MAIL)
        || print STDERR "notify: $Sendmail failed: $! (status $?)\n";
}