#!/bin/sh -- # A comment mentioning perl, to prevent perl from looping. Indented to work with bash. eval 'if [ "`type perl5 2>&1 | grep -i not.found`" = "" ]; then \ MY_PERL=perl5; export MY_PERL; \ exec perl5 -S -w $0 ${1+"$@"}; \ else \ MY_PERL=perl; export MY_PERL; \ exec perl -S -w $0 ${1+"$@"}; \ fi' if 0; ############################################################################## # For a description, go down to the $Usage statement below, or run # this program with the -help flag. ############################################################################## $Program = 'cobwebs'; $Version = '0.2'; $Copyright = "$Program v$Version Copyright (c) 1997, 2000 Karl J. Runge"; ############################################################################## # Hack to let user force usage with perl4 on Unix... (not completely done). # if ( !defined($ENV{'TRY_PERL4'}) || ! $ENV{'TRY_PERL4'} ) { # Exit if not Perl5. die "\"$Program\" needs perl version 5 or greater." if $] < 5.000; } # Flag to use when trying perl4 compatibility. (not used). $Perl5 = 1; $Perl5 = 0 if $] < 5.000; ############################################################################## # Configuration: # # You can set things here, but most all of the following can be set # via command line switches. # $Is_Sysv = 0; # E.g. 1 for Solaris 0 for BSD/Linux # Cmdline: -sysv $Root_Dir = "."; # Default location of Web site's # HTML source tree. # Cmdline: -root $Unix = 1; # Whether machine is Unix or not. # Cmdline: -unix or -windows $Timeout = 0; # Set the timeout for hung connections # this will be either used by select() # or by alarm(). # Cmdline: -timeout $Wait_Method = ''; # Set an alarm (On Unix) to timeout # hung connections after $Timeout seconds. # Or use select() # Cmdline: -alarm or -select $Remove_Comments = 1; # Whether to remove # comments. 0 => check inside comments. # Cmdline: -check_comments $Keep_Dots = 0; # 1 => keep a/b/c/../foo.gif in paths. $Skip_Backups = 1; # Skip backup files and dirs. # 0 => check them too. # Cmdline: -check_backups $File_Skip_Match = 'SCCS/|RCS/|Bak/'; # Pattern to indicate backup files+dirs # e.g. SCCS/ dirs are skipped. # Cmdline: -file_skip $Url_Skip_Match = ''; # Pattern to indicate which URL's # to skip checking. They will be # flagged as OK. # Example 'JavaScript:|...' $Use_Httpd = ''; # If set, local files are checked # the HTTP server instead of local # filesystem. Set to http://host.domain # Cmdline: -use_httpd $Report_Header = 0; # Report the full HTTP return header. # Cmdline: -show_header $Head_Only = 0; # Use only the HEAD HTTP method, no GET. # Cmdline: -head_only $File_Match = '*.htm*'; # Unix find glob pattern. Only HTML # files matching this are checked. # Cmdline: -m $Links_Only = 0; # Set to 1 means do not check links, # just return a list of them. # Cmdline: -links_only $Log_File = "/tmp/$Program.$$"; # Default place to log the output. $Log_File0 = $Log_File; # Errors go to $Log_File.errs $Log_File_Windows = 'cobtmp.out'; # Use this for windows. # Cmdline: -l $Check_Remotes = 1; # Set to 0 means only check local links. # Cmdline: -local_only $Check_Anchors = 1; # Check links. # Cmdline: -no_anchors $Check_Images = 1; # Check links. # Cmdline: -no_images $Check_Actions = 1; # Check
links. # Cmdline: -no_actions # (NOT IMPLEMENTED) $Check_Mailto = 0; # Check mailto:user@host links. # Cmdline: -mailto $Shutdown = 0; # call shutdown(2) after sending request. $Shutdown_Match = ''; # Hostname guesses for mailer. i.e. mail.domain.com, mailhost.domain.com, etc... @Mail_Guess = ( 'mail', 'mailhost', 'mx' ); # Any more? $Who_To_Email = ''; # Recipients to e-mail errors to. $Mail_Command = 'mailx'; # Mail command: mail_cmd -s "subj" who < file system("type mailx >/dev/null 2>&1"); if ( $? ) { $Mail_Command = "mail"; } else { $Mail_Command = "mailx"; } $Mail_Subj = "$Program Errors found"; # Cmdline: -who $Check_Telnet = 0; # Check telnet:host links. # Cmdline: -telnet $Check_Rlogin = 0; # Check rlogin:user@host links. # Cmdline: -rlogin $Debug = 0; # Turn on debugging output. # Cmdline: -d $Verbose = 0; # Print each output line to screen # as it comes up. # Cmdline: -v $Number_of_Jobs = 1; # When doing the link checking, # Create this many processes and # do the work in parallel. # Cmdline: -jobs $Proxy_Host = 'NONE'; # Link checking to be done via a # HTTP proxy host. # Cmdline: -proxy_host $Proxy_Port = ''; # Port number of the proxy host. # Cmdline: -proxy_port $Ftp_Passwd = 'billg@microsoft.com'; # Used for anonymous FTP. $NL = '__NEWLINE__'; # Various substitutional strings $CB = '__COMMENT_BEGIN__'; # used in regex's below. $CE = '__COMMENT_END__'; # Not set on command line. $PIPE = ''; # Replace "|" $COBERR = '__COBWEB_ERROR__'; # Generic error string. $Use_Internal_Find = 0; # 1 => Use module File::Find # User Agent lines to send to # a remote http server. # Cmdline: -user_agent # Cmdline: -no_user_agent $User_Agent_Lines = <<'END'; User-Agent: Mozilla/4.89 [en] (X11; I; Linux 2.8.11 i686) Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, */* Accept-Language: en Accept-Charset: iso-8859-1,*,utf-8 END $Retrieve = ''; # D $Report_Local = 0; ############################################################################## $Usage = <<"END"; $Program: Search through an HTML source tree and look for bad hyperlinks. This program is run from the root directory of a Web site''s HTML source tree. It goes through and finds all HTML source files. It reads and parses each file, (usually) detecting all the hyperlinks ( and the following "protocols": mailto: ftp:// gopher:// telnet:// rlogin:// It is, of course, not perfect, but it does find a lot of cobwebs. ;-) Usage: $Program [] {file | dir} ... You list the files you want checked and/or directories recursively searched on the command line. These paths should be relative to the Server root dir of the Website''s HTML source tree. Default is ".", i.e. search the whole tree. So you can just cd to the HTML root and type "$Program" to run the check, however since that is rather quiet, it is better to first type "$Program -v" to get some output as the program is running. Options: -h, -help This help. -v Verbose mode. Print each piece of information to the screen as it is found, in addition to writing it to the log file. -verbose works too. -d Turn on extra debugging printing. -debug works too. -l Place output in file "file" and errors in file "file.errs". -logfile works too. -root Directory should be the root of the Web site source. This simply does a chdir() to before starting. -m Check files matching glob Default: $File_Match -match works too. -file_skip Skip (e.g. backup) files and dirs matching perl regular expression Default: $File_Skip_Match -url_skip Skip checking any URL matching -check_backups Don''t skip the backup files matching $File_Skip_Match. -check_comments Check hyperlinks found inside too. -keep_dots Do not try to remove upward references like: a/b/../c.gif => a/c.gif This is useful to see if your ../ links just work by the luck of the browser or http server or if they really point to where you intend. -use_httpd Test even local files by HTTP. should refer to the server of the HTML tree being checked (e.g. http://host.domain) -show_header Include the returned HTTP in the report even for OK links. -head_only Use only the HTTP "HEAD" method, no GET. (Good netizen mode) (GET is used to check "#" anchors though) -timeout Set timeout to wait for hung connections. The timeout will be seconds. Default is to not check for hung connections. -alarm Use alarm() to wait for the hung connection. (Unix only) -select Use select() on the file handle to wait for the hung connection (used on Windows). Both will wait "-timeout" seconds. Note that select() is mixed with buffered sockethandle reads , so there may be problems on some platforms. If you don''t know which of -alarm or -select to pick, don''t specify either and one will be automatically selected whenever you use the -timeout switch. -shutdown Call shutdown(2) after sending the HTTP request. -shutdown_match Call shutdown(2) if URL matches pattern. -jobs When doing the link verifying, create this many processes (on Unix) checking the links in parallel. -links_only Do not check any hyperlinks, just report a list of the hyperlink URL''s found. -local_only Do not check hyperlinks that require a remote connection. (i.e. only checks the local filesystem) -no_anchors Do not check the Anchor links -no_images Do not check the Image links -no_actions Do not check the Form Action links (ACTIONS NOT IMPLEMENTED) -mailto Check mailto: URLs. Looks for working e-mail address. Hostname guesses for mail server are: @Mail_Guess (will be prefixed to domain) -telnet Check telnet: URLs. Looks for working telnet connection. (connection only, no dialog) -rlogin Check rlogin: URLs. Looks for working rlogin connection. (connection only, no dialog or auth) -windows Machine is Windows. (Requires Win32 perl in PATH) -unix Machine is Unix. -sysv Machine has SVR4 style sockets (e.g. Solaris) Default is has BSD style sockets (e.g. Linux). -who E-mail the errors found to the people on Hacks: If one of the recipients is "GT:n" where "n" is an integer, then the mail is only sent if there are more than "n" errors. Also, if one of the recipients is "WRAP" then the E-mail body is passed thru "$Program -wrap" to make the output a bit more readable. -proxy_host Don''t make direct connections when checking remote URLs, use proxy instead. -proxy_port The proxy''s port number is . -user_agent Send to remote http servers after the GET line. Percent-signs (%) will be converted to newlines except \% which goes to %. -no_user_agent Don''t sent any User-Agent lines to remote servers. -internal_find Use the Perl5 File::Find module instead of the Unix /bin/find utility. -wrap If the wide one-line-per-url output is to difficult for you to view, run an output file thru "$Program -wrap" to wrap the output to multi-lines and also "format" a bit for better readability. E.g: $Program -wrap cob.out where "cob.out" is a one-line-per-url output file. The formatted output goes to stdout. Notes: Local references to people''s home websites, e.g. /~fred/index.html will not be expanded. Remote ones, http://foo.com/~fred/index.html of course will be expanded correctly. To check the local ones, you could use -use_httpd . TODO: mention -use_httpd http://server/~fred/ "$Program" has been ported to work on Win95 and WinNT using Perl5. Limitations: it cannot fork() parallel jobs or send email (TBD). A way to start it on Windows would be something like: perl $Program ... Provided perl is in the PATH and "$Program" in the current directory. See the Win32 Perl docs for more info on launching scripts. This program is mainly intended to be run as a batch job, e.g. via crontab(1). It is not blazingly fast. The -jobs switch can be used (on Unix) to spawn processes verifying links in parallel. Yes, there are a lot of options, and they have long names, so why not make yourself a little script, that sets your basic options which you can easily comment out functionality when you want to, and even type in extra options at the cmd line when you want to e.g.: #!/bin/sh root=\$HOME/www_src uskip="javascript:|JavaScript:" extras="-mailto -telnet -rlogin" httpd="-use_httpd http://www.karlrunge.com/" timeout="-timeout 240" misc="-verbose" $Program \$misc \$timeout \$extras \$httpd -url_skip "\$uskip" -root \$root \$* exit \$? A script like the above could be the basis for a daily or weekly cron job. (see the -who e-mail flag). $Copyright END ############################################################################## #Todo: # better find globbing pattern. Use perl grep()? # does File::Find follow symlinks? # local_url_ok checks readability of file too. # Handle ~user/foo.html links. (SKIP, -use_httpd instead) # what about symlinks to far away places (outside root)? # follow 300 and 400 REDIRS with Location: info (DONE) # use mx(1) to check mailto's (DONE) # Send Accept */* (DONE) # windows perl check. (DONE) # extract_host gets user too (DONE) # Mail errors on Unix (DONE) # wrap lines output mode (DONE) # work around unportability in early File::Find module (DONE) # windows find? (DONE) # sub unbuffer(FH) (DONE) # ftp_ok check (DONE) # add mx to mailhosts (DONE) # windows logfile name (DONE) # HEAD only mode (DONE) # -show_header switch to always show returned HTTP header. (DONE) # keep_dots mode (DONE) # remote ..'s (DONE) # alarm and select wait mode switches driven by timeout. (DONE) # try_perl4 mode, note perl5 (DONE) # glob *.htm at same time. (DONE) # implement mailto (DONE) # implement telnet (DONE) # implement rlogin (DONE?) # fork parallel jobs (DONE) # function to test http_status (DONE) # dot remove and dot dot removal(DONE) # preserve Use_Http path after dotdot (DONE) # port to windows. (Slashes? Logfile only!) (DONE) # anything to examine from httpd.conf? (SKIP) # minimal gopher support w/o proxy (DONE) # HREF="JavaScript:foo('bar.html')" (SKIP) # Add Skip Match pattern (DONE) ############################################################################## # Command line argument processor: LOOP: while (@ARGV) { $_ = shift @ARGV; CASE: { /^-h$|^-help$/ && ((&help()), exit 0, last CASE); /^-wrap$/ && ((&do_wrap()),exit 0, last CASE); /^-d$|^-debug$/ && ($Debug = 1, last CASE); /^-root$/ && ($Root_Dir = shift, last CASE); /^-m$|^-match$/ && ($File_Match = shift, last CASE); /^-file_skip$/ && ($File_Skip_Match = shift, last CASE); /^-url_skip$/ && ($Url_Skip_Match = shift, last CASE); /^-check_comments$/ && ($Remove_Comments = 0, last CASE); /^-keep_dots$/ && ($Keep_Dots = 1, last CASE); /^-check_backups$/ && ($Skip_Backups = 0, last CASE); /^-use_httpd$/ && ($Use_Httpd = shift, last CASE); /^-show_header$/ && ($Report_Header = 1, last CASE); /^-head_only$/ && ($Head_Only = 1, last CASE); /^-timeout$/ && ($Timeout = shift, last CASE); /^-alarm$/ && ($Wait_Method = 'alarm', last CASE); /^-shutdown$/ && ($Shutdown = 1, last CASE); /^-shutdown_match$/ && ($Shutdown_Match = shift, last CASE); /^-select$/ && ($Wait_Method = 'select', last CASE); /^-jobs$/ && ($Number_of_Jobs = shift, last CASE); /^-who$/ && ($Who_To_Email = shift, last CASE); /^-links_only$/ && ($Links_Only = 1, last CASE); /^-l$|^-logfile$/ && ($Log_File = shift, last CASE); /^-user_agent$/ && ($User_Agent_Lines = shift, last CASE); /^-no_user_agent$/ && ($User_Agent_Lines = '', last CASE); /^-retrieve$/ && ($Retrieve = shift, last CASE); /^-report_local$/ && ($Report_Local = 1, last CASE); /^-v$|^-verbose$/ && ($Verbose = 1, last CASE); /^-no_anchors$/ && ($Check_Anchors = 0, last CASE); /^-no_images$/ && ($Check_Images = 0, last CASE); /^-no_actions$/ && ($Check_Actions = 0, last CASE); /^-mailto$/ && ($Check_Mailto = 1, last CASE); /^-telnet$/ && ($Check_Telnet = 1, last CASE); /^-rlogin$/ && ($Check_Rlogin = 1, last CASE); /^-local_only$/ && ($Check_Remotes = 0, last CASE); /^-win$/ && ($Unix = 0, last CASE); /^-unix$/ && ($Unix = 1, last CASE); /^-sysv$/ && ($Is_Sysv = 1, last CASE); /^-proxy_host$/ && ($Proxy_Host = shift, last CASE); /^-proxy_port$/ && ($Proxy_Port = shift, last CASE); /^-internal_find$/ && ($Use_Internal_Find = 1, last CASE); /^--$/ && (last LOOP); /^-(-.*)$/ && (unshift(@ARGV, $1), last CASE); if ( /^-(..+)$/ ) { # split bundled switches: local($y, $x) = ($1, ''); foreach $x (reverse(split(//, $y))) { unshift(@ARGV,"-$x") }; last CASE; } /^-/ && ( (print "\n$Program: $_ is not an option, try \"$Program -help\".\n"), exit 1, last CASE); unshift(@ARGV,$_); # put back if done with flags. (the rest # of @ARGV are directories or files) last LOOP; } } ############################################################################## # Record startup time. $Start_Time = &date; ############################################################################## # Check for Windows. if ( $Unix && &Check_Windows() ) { $Unix = 0; } ############################################################################## # Open up the log file for output. if ( ! $Unix ) { if ( $Log_File eq $Log_File0 ) { # Choose a safe Windows filename $Log_File = $Log_File_Windows; } $Log_File =~ s,\\,/,g; } open(LOG, "/dev/null") if 0; # Jus' to avoid warning msg. open(LOG, ">$Log_File") || die "cannot open \"$Log_File\""; $Log = 'LOG'; select(STDERR); $| = 1; select(STDOUT); $| = 1; ############################################################################## # Decide timeout wakeup method: if ( $Timeout > 0 ) { if ( $Wait_Method eq 'alarm' ) { if ( ! $Unix ) { &Log("Info: Changing wait method to 'select' on non-Unix\n"); $Wait_Method = 'select'; } } elsif ( $Wait_Method eq 'select' ) { ; # equally OK for both OS's. } else { if ( $Unix ) { $Wait_Method = 'alarm'; } else { $Wait_Method = 'method'; } } } ############################################################################## # Unbuffer all output streams: (flushed every print) &unbuffer(LOG); &unbuffer(STDERR); &unbuffer(STDOUT); ############################################################################## # This string is for informational message only: $Items_To_Check = ''; $Items_To_Check = join(' ', @ARGV); if ( $Items_To_Check =~ /^\s*$/ ) { $Items_To_Check = "."; } ############################################################################## # Process User_Agent_Lines percents to newlines: $User_Agent_Lines =~ s/\\%/__PERCENT_PLACEHOLDER__/g; $User_Agent_Lines =~ s/%/\n/g; $User_Agent_Lines =~ s/__PERCENT_PLACEHOLDER__/%/g; $User_Agent_Lines .= "\n" unless $User_Agent_Lines =~ /\n$/; $User_Agent_Lines_1 = $User_Agent_Lines; $User_Agent_Lines_1 =~ s/\n/ % /g; $User_Agent_Lines =~ s/\n/\r\n/g; ############################################################################## # Windows probably has no Unix-ish /bin/find # if ( ! $Unix ) { $Use_Internal_Find = 1; } ############################################################################## # This is a list of all configurable variables. # @Params = ( Program, Version, Start_Time, Root_Dir, Items_To_Check, Remove_Comments, Skip_Backups, File_Skip_Match, Url_Skip_Match, File_Match, Links_Only, Check_Remotes, Check_Anchors, Check_Images, Check_Actions, Check_Mailto, Check_Telnet, Check_Rlogin, Log_File, Unix, Number_of_Jobs, Is_Sysv, Debug, Verbose, Report_Header, Head_Only, Use_Httpd, Keep_Dots, Who_To_Email, Mail_Command, Wait_Method, Timeout, Proxy_Host, Proxy_Port, Report_Local, Retrieve, User_Agent_Lines_1, ); # Show the user the values of the set of configurable variables. &Log("Info: ** Parameters **\n"); foreach $key (@Params) { # print STDERR "key $key\n"; eval "\$x = \$$key"; &Log("Info: $key: $x\n"); } ############################################################################## # Change directory to the root of the web site HTML source tree # (e.g. this dir should contain the site toplevel: http://server/index.html) if ( $Root_Dir eq "." ) { $Orig_Dir = $Root_Dir; } elsif ( $Root_Dir =~ m,^(http|ftp)://, ) { ; } else { use Cwd; # PERL5 dep # Unportability? With Perl Modules? Naaaaah.. if ( $[ <= 5.001 ) { $Orig_Dir = getcwd(); } else { $Orig_Dir = cwd(); } if ( ! -d "$Root_Dir" || ! chdir($Root_Dir) ) { die "Could not chdir to \"$Root_Dir\""; } &Log("Info: changed directory to $Root_Dir\n"); } ############################################################################## # If there are NO dirs or files on command line then do the whole tree # by starting at "." if ( !defined($ARGV[0]) || $ARGV[0] eq '' ) { push (@ARGV, "."); } $Link_Count = 0; # Global used counting each link found. $File_Count = 0; &Log("Info: FILES FOUND: File: []\n"); ############################################################################## # Retrieve html case. #if ( $Retrieve ne '' && ! -d $Retrieve ) { # #} ############################################################################## # Loop over all files or dirs on command line. foreach $arg (@ARGV) { ###################################################################### # Let's get busy! $start_dir = $arg; # N.B. start_dir could be a single file. if ( $start_dir =~ m,^/, ) { &Log("Info: WARNING: finding from absolute path may give unexpected results:\n"); &Log("Info: WARNING: find dir: $start_dir\n"); &Log("Info: WARNING: It is best to use relative paths from Web Site Server Root.\n"); } elsif ( $start_dir =~ m,^\.\., ) { &Log("Info: WARNING: It is assumed the current directory is the Web Site Server Root.:\n"); &Log("Info: WARNING: find dir: $start_dir\n"); &Log("Info: WARNING: Going up a directory could lead to unexpected results.\n"); } ###################################################################### # Open up a streaming list of found HTML files: if ( $Use_Internal_Find ) { local($m) = $File_Match; # Try to turn shell glob into regex... not very accurate. $m =~ s/\./\\./g; $m =~ s/\*/.*/g; # PERL5 dep local($cmd) = "use File::Find; sub w {\$f = \$File::Find::name; print \$f, \"\\n\" if \$f =~ /$m/;} find(\\&w, \"$start_dir\");"; # Might look like: # use File::Find; sub w {$f = $File::Find::name; print $f, "\n" if $f =~ /.*\.htm.*/;} find(\&w, "."); local($perl_cmd) = 'perl'; if ( $ENV{'MY_PERL'} ne '' ) { $perl_cmd = $ENV{'MY_PERL'}; } if ( -f $start_dir ) { # Special case if it is a single file local($sd) = $start_dir; if ( $sd !~ m,^\./, ) { $sd = "./$sd"; # Need ./filename } # Just echo it. open(HTML, "echo \"$sd\"|"); # Run perl5 File::Find cmd: } elsif ( $Unix ) { # Worry about $f, etc expanded by the shell? &Log("Info: Using perl internal Find cmd: $cmd\n"); open(HTML, "$perl_cmd -e \'$cmd\'|") || die "$!"; } else { &Log("Info: Using perl internal Find cmd: $cmd\n"); $cmd =~ s/"/\\"/g; open(HTML, "$perl_cmd -e \"$cmd\"|") || die "$!"; } } elsif ( $Unix ) { # Just use /bin/find local($sd) = $start_dir; if ( -f $start_dir ) { # Special case if it is a single file if ( $sd !~ m,^\./, ) { $sd = "./$sd"; # Need ./filename } } open(HTML, "find $sd -follow -name \"$File_Match\" -print|") || die "$!"; } else { # Hmmm... &Log("Info: OS not supported\n"); exit 1; } ###################################################################### # Loop over all HTML files found for this case: while () { chop($file = $_); $File_Count++; # Process the file: &Process($file); } close(HTML); # On to next command line file or dir. } ############################################################################## # Display lists of links found: if ( $Report_Local ) { &Report_Local(); exit 0; } # Log all of the anchors found: &Log("Info: ANCHORS FOUND: Anchor: | | \n"); foreach $key (sort(keys(%Anchor_FileUrl))) { $desc = $Anchor_FileUrl{$key}; &Log("Anchor: $key | $desc\n"); $Anchor_FileUrl_Count++; } # Log all of the anchors found: &Log("Info: IMAGES FOUND: Image: | | IMAGE\n"); foreach $key (sort(keys(%Image_FileUrl))) { $desc = $Image_FileUrl{$key}; &Log("Image: $key | $desc\n"); $Image_FileUrl_Count++; } # Log all of the anchors found: &Log("Info: FORM ACTIONS FOUND: Form: | | FORM\n"); foreach $key (sort(keys(%Action_FileUrl))) { $desc = $Action_FileUrl{$key}; &Log("Form: $key | $desc\n"); $Action_FileUrl_Count++; } ############################################################################## # If -links_only, we are done. if ($Links_Only) { # No checking, just print the links. &Log("Info: Found $Link_Count hyperlinks in $File_Count files.\n"); &Msg("$Program: list of links placed in file: $Log_File\n"); exit 0; } ############################################################################## # Get ready to Check the hyperlinks: $error_count = 0; # Initialize socket package. &Use_Ksockets(); ############################################################################## # Check if we spawn a bunch of slaves to make the network connections: # $Parallel = 0; $My_Job = -1; $Parallel = 1 if $Unix && $Number_of_Jobs > 1; $Slave_Msg = ''; $url_check_msg = "Info: URL CHECKS: Check: {OK:|ERR:} {LOCAL:|REMOTE:} | | \n"; &Log("$url_check_msg"); if ( $Parallel ) { # Many slave case: local($job, $pid); $My_Job = 0; # Parent is Job 0 $PidList[0] = 'me'; # Fork off the slave processes for ($job = 1; $job <= $Number_of_Jobs; $job++) { if ( $pid = fork) { $PidList[$job] = $pid; $PidMap{$pid} = $job; &Log("Info: Spawned Parallel job($pid): $job of $Number_of_Jobs\n"); &short_sleep("0.3"); } elsif (defined($pid)) { # Slave process must redirect it's logging $My_Job = $job; if ( defined($Log) ) { close($Log); } $Slave_Msg = "SLAVE($My_Job): "; $My_Log = "$Log_File.$My_Job"; open(PLOG, "/dev/null") if 0; # Just to avoid 'PLOG' typo message chdir($Orig_Dir) || die "${Slave_Msg}cannot chdir to Orig_Dir: \"$Orig_Dir\"\n"; open(PLOG, ">$My_Log") || die "${Slave_Msg}cannot open my log file \"$My_Log\"\n"; chdir($Root_Dir) || die "${Slave_Msg}cannot chdir to Root_Dir: \"$Root_Dir\"\n"; $Log = 'PLOG'; &short_sleep("1.0"); last; # Child exits the loop and gets to work. } else { # Could not fork. Bail out. &Msg("Error forking job $job. Exiting.\n"); local($p); # Kill children processes foreach $p (@PidList) { next if $p =~ /me/; kill 9, $p; } sleep 1; exit 1; } } } ############################################################################## # Check each of the various types of items: # if ( ! $Parallel || $My_Job > 0 ) { # Master skips this block # ANCHORS: $Check_Mode = 'Anchors'; $Check_Size = $Anchor_FileUrl_Count; if ( ! $Check_Anchors ) { &Log("Info: WARNING: Skipping Anchors Check\n"); } else { &Check_Anchors(); } # ACTIONS: $Check_Mode = 'Actions'; $Check_Size = $Action_FileUrl_Count; if ( ! $Check_Actions ) { &Log("Info: WARNING: Skipping Actions Check\n"); } else { &Check_Actions(); # N.B. Not implemented. } # IMAGES: $Check_Mode = 'Images'; $Check_Size = $Image_FileUrl_Count; if ( ! $Check_Images ) { &Log("Info: WARNING: Skipping Images Check\n"); } else { &Check_Images(); } } ############################################################################## # If doing checks in parallel, here is wrap up work. # if ( $Parallel ) { if ( $My_Job > 0 ) { # Slave closes log file &Msg("${Slave_Msg}I, $$, am DONE.\n"); close($Log); exit 0; } elsif ( $My_Job == 0 ) { local($pid); # Master waits for all Slaves to finish foreach $pid (@PidList) { next if $pid eq 'me'; &Msg("MASTER: wait for $pid\n"); $w = waitpid($pid, 0); &Msg("MASTER: got SLAVE($PidMap{$pid}) $pid with $w\n"); } # Gather all the Slave output together. local($job, $pfile); chdir($Orig_Dir) || die "MASTER: cannot chdir to Orig_Dir: \"$Orig_Dir\"\n"; for ($job = 1; $job <= $Number_of_Jobs; $job++) { $pfile = "$Log_File.$job"; open(JOB, "$pfile") || die "MASTER: problem opening job $job output file $pfile: $!"; while () { &Log($_); } close(JOB); unlink($pfile); } chdir($Root_Dir) || die "MASTER: cannot chdir to Root_Dir: \"$Root_Dir\"\n"; } } ############################################################################## # Report results to the user: # &Log("Info: Found $Link_Count hyperlinks in $File_Count files.\n"); close($Log) if defined($Log); &Msg("$Program: Full output placed in file: $Log_File\n"); &Msg("$Program: Here are the errors found (if any).\n"); &Msg("$Program: Errors will also be put in file: $Log_File.errs\n"); # We must go back to original dir to find the Log file!! if ( $Root_Dir ne '.' ) { if ( ! -d "$Orig_Dir" || ! chdir($Orig_Dir) ) { die "Could not chdir to \"$Orig_Dir\""; } } # Open the log file for reading, and the error file for writing: open(OUTPUT, "$Log_File") || die "cannot open \"$Log_File\""; # Collect warnings and errors and print them # to STDOUT and error log file: $total = 0; $warnings = ''; $errors = ''; while () { $total++ if /^check:/i; if ( /^check: err:/i ) { $errors .= $_; $error_count++; } elsif ( /^check:.*warning:/i ){ if ( ! /: Skip/ ) { $warnings .= $_; } } } close(OUTPUT); print STDOUT "*** ERRORS ***\n$errors"; print STDOUT "*** WARNINGS ***\n$warnings"; open(ERRS, ">$Log_File.errs") || die "cannot open \"$Log_File.errs\""; print ERRS $url_check_msg; print ERRS "*** ERRORS ***\n$errors"; print ERRS "*** WARNINGS ***\n$warnings"; $total = 1 if $total == 0; ###################################################################### # Calculate percentage dead links: # $percent = (100.0 * $error_count)/$total; $percent = sprintf("%.1f", $percent); $t1 = "$Program: Found $error_count errors out of $total checked links (${percent}%).\n"; $t2 = "$Program: Output placed in files: $Log_File and $Log_File.errs\n"; $Finish_Time = &date; $t3 = "Info: Start: $Start_Time\nInfo: Finish: $Finish_Time\n"; &Msg("$t1"); &Msg("$t2"); &Msg("$t3"); ###################################################################### # Shove this info also at bottom of log files # open(OUTPUT, ">>$Log_File") || die "$!"; print OUTPUT "Info: $t1"; print OUTPUT "Info: $t2"; print OUTPUT "$t3"; close(OUTPUT); print ERRS "Info: $t1"; print ERRS "$t3"; close(ERRS); ############################################################################## # E-mail errors to people # if ( $Who_To_Email ne '' && $Unix ) { local($cnt) = 0; local($tmpfile) = ''; if ( $Who_To_Email =~ s/\s*GT:(\d+)\s*// ) { $cnt = $1; } local($mail_file) = "$Log_File.errs"; if ( $Who_To_Email =~ s/\s*WRAP[,]*\s*// ) { $tmpfile = "/tmp/cobmail.$$"; system("$0 -wrap $mail_file > $tmpfile"); $mail_file = $tmpfile; } $Who_To_Email =~ s/,/ /g; if ( $error_count > 0 ) { if ( $cnt == 0 || $error_count > $cnt ) { $cmd = "$Mail_Command -s \"$Mail_Subj($error_count)\" $Who_To_Email < $mail_file"; &Msg("Mail($error_count,$cnt): $cmd\n") if $Debug; system("$cmd"); } } unlink($tmpfile) if $tmpfile ne ''; } exit $error_count; ############################ Subroutines: ############################# sub Process { ############################################################## # Process a single HTML file. ############################################################## local($file) = @_; # filename is input argument. local($contents) = ''; # Check if it is a Backup file and should be skipped. if ( $Skip_Backups ) { local($match) = $File_Skip_Match; ### $match s/(\W)/\\$1/g; if ( $file =~ m,$match,o ) { &Log("File: $file WARNING: Skipping: it is backup or some other thing to skip.\n"); return; } } # Check if it is even a regular file: if ( ! -f $file ) { &Log("File: $file WARNING: Skipping: it is not a file.\n"); return; } # Try to open, return on failure: if ( ! open(FILE, "$file") ) { &Log("File: $file WARNING: Skipping: cannot open it.\n"); return; } # Log the filename: &Log("File: $file\n"); # Gather up the whole contents of the file in the string $contents while() { chop; $_ =~ s/\r$//; $contents .= $_ . $NL; # N.B. Newlines are replaced. } close(FILE); # Now go and parse the file contents looking for hyperlinks. &Parse_File($contents, $file); undef($contents); } sub Parse_File { ############################################################## # Parse the contents of a single HTML file, extracting and # logging all hyperlinks found inside. ############################################################## local($contents, $file) = @_; # file contents and filename # are input arguments. local($url, $desc, $action, $image); local($base); # Remove HTML comments: if ( $Remove_Comments ) { # this misses server include formatting. # e.g. with temporary strings. $contents =~ s,\<\s*\!\-\-,$CB,og; $contents =~ s,\-\-\s*\>,$CE,og; # Delete stuff between the temporary string pairs. $contents =~ s,$CB.*?$CE,,og; # PERL5 dep } # Replace the newline temporary string with a space. $contents =~ s/$NL/ /og; local($i) = 0; # For counting hyperlinks found. # Check for HTML base. if ($contents =~ m,\<\s*BASE\s+[^\>]*HREF\=([^\s\>]+)[^\>]*\>,io) { $base = $1; } else { # Otherwise, use the location of file in the HTML tree. $base = &base_href($file); } # Record the base found above. $Base_Href{$file} = $base; local($base_name) = &base_name($file); # Loop until no more anchors found: while (1) { # Check for anchor: if ($contents =~ # s,\<\s*A\s+[^\>]*HREF\=([^\s\>]+)[^\>]*\>([^\<]*)\<\s*/\s*A\s*\>,,io s,\<\s*A\s+[^\>]*HREF\=([^\s\>]+)[^\>]*\>(.*?)\<\s*/\s*A\s*\>,,io ) { # match: Description # $1 $2 # Record the URL and Description found: $url = $1; $url = &trim_ends($url); if ( $url =~ /^#/ ) { $url = "$base_name$url"; } $desc = $2; if ( !defined($desc) ) { $desc = ''; # warning message? } $desc = 'NONE' if $desc eq ''; $desc =~ s/\|/$PIPE/og; &Log_Anchor($url, $desc, $file); $i++; } else { # No match? Then done with loop. last; } } # Loop until no more images found: while (1) { # Check for image: if ($contents =~ s,\<\s*IMG\s+[^\>]*SRC\=([^\s\>]+)[^\>]*\>,,io ) { # match: # $1 # Record the URL of the image: $image = $1; $image = &trim_ends($image); &Log_Image($image, $file); $i++; } else { # No match? Then done with loop. last; } } while (1) { if ($contents =~ s,\<\s*FORM\s+[^\>]*ACTION\=([^\s\>]+)[^\>]*\>,,io ) { # match: # $1 # Record the URL of the action: $action = $1; $action = &trim_ends($action); &Log_Action($action, $file); $i++; } else { # No match? Then done with loop. last; } } undef($contents); } sub Get_Href_Names { ############################################################## # Parse the contents of a single HTML file, extracting and # logging all NAME ANCHORS found inside. ############################################################## local($contents) = @_; # file contents is input argument. local(@Names); local($url, $desc, $action, $image); local($name); $contents =~ s/\r\n/ /og; $contents =~ s/\n/ /og; # Remove HTML comments: if ( $Remove_Comments ) { # this misses server include formatting. # e.g. with temporary strings. $contents =~ s,\<\s*\!\-\-,$CB,og; $contents =~ s,\-\-\s*\>,$CE,og; # Delete stuff between the temporary string pairs. $contents =~ s,$CB.*?$CE,,og; # PERL5 dep } local($i) = 0; # For counting href Names found. # Loop until no more anchors found: while (1) { # Check for anchor: if ($contents =~ s,\<\s*A\s+[^\>]*NAME\=([^\s\>]+)[^\>]*\>,,io ) { # match: # $1 $name = $1; $name = &trim_ends($name); $Names[$i++] = $name; } else { # No match? Then done with loop. last; } } return (@Names); } sub Has_Href_Name { ############################################################## # Check if an HTML file has a specified NAME Anchor. ############################################################## local($contents, $name) = @_; local($nm); local($ok, @Names); @Names = &Get_Href_Names($contents); $ok = 0; $name =~ s/(\W)/\\$1/g; foreach $nm (@Names) { if ( $nm =~ /^${name}$/i ) { $ok = 1; last; } } return $ok; } sub Log_Anchor { ############################################################## # Record a single Anchor Url in global Hash Table ############################################################## # Input is is the URL, its Description, and the Filename it came from. local($url, $desc, $file) = @_; local($c, $cf); $url = &trim_ends($url); # Remove leading and trailing spaces $desc = &trim_ends($desc); # and quotation marks. $c = $Link_Count++; # Global count: $cf = $File_Count{$file}++; # Count for the file: # $List_Url[$c] = $url; # Not used... # $List_Desc[$c] = $desc; # $List_File[$c] = $file; # A debugging message: # &Msg("LOG($c): $file : $url\n"); # Record the info in two hashes: # $Anchor_Url{$url} = "$file | $desc"; $Anchor_FileUrl{"$file | $url"} = $desc; } sub Log_Action { ############################################################## # Record a single Form Action Url in global Hash Table ############################################################## # Input is the action URL and the Filename it came from. local($action, $file) = @_; local($c, $cf); local($desc) = 'FORM'; # Set description to "FORM" $action = &trim_ends($action); # Remove some spaces and quotes. $c = $Link_Count++; # Global count. $cf = $File_Count{$file}++; # Count for the file. # $List_Action[$c] = $action; # Not used. # $List_Desc[$c] = $desc; # $List_File[$c] = $file; # Record the info in two hashes: # $Action_Url{$action} = "$file | $desc"; $Action_FileUrl{"$file | $action"} = $desc; } sub Log_Image { ############################################################## # Record a single Image Url in global Hash Table ############################################################## # Input is the image URL and the Filename it came from. local($image, $file) = @_; local($c, $cf); local($desc) = 'IMAGE'; $image = &trim_ends($image); # Remove some spaces and quotes. $c = $Link_Count++; # Global count. $cf = $File_Count{$file}++; # Count for the file. # $List_Image[$c] = $image; # Not used. # $List_Desc[$c] = $desc; # $List_File[$c] = $file; # Record the info in two hashes: # $Image_Url{$image} = "$file | $desc"; $Image_FileUrl{"$file | $image"} = $desc; } sub Check_Anchors { ############################################################## # Loops over all Anchors found. Calls the checker subroutine # and logs the results. ############################################################## # No Input. local($key, $file, $url); local($ok, $msg); local($count) = 0; # Loop over all Anchors found: foreach $key (sort(keys(%Anchor_FileUrl))) { # If I am a slave process, maybe I should skip this one: next if &Job_Skip($count++); # Separate file and url: ($file, $url) = split(/\s*\|\s*/, $key, 2); # Go check the url: ($ok, $msg) = &Check_Url($file, $url); next if $Report_Local; # Log the result: if ( $ok ) { &Log("Check: OK: $msg:\t| $key\n"); } else { &Log("Check: ERR: $msg:\t| $key\n"); } } } sub Check_Images { ############################################################## # Loops over all Images found. Calls the checker subroutine # and logs the results. ############################################################## # No Input. local($key, $file, $url); local($ok, $msg); local($count) = 0; # Loop over all Images found: foreach $key (sort(keys(%Image_FileUrl))) { # If I am a slave process, maybe I should skip this one: next if &Job_Skip($count++); # Separate file and url: ($file, $url) = split(/\s*\|\s*/, $key, 2); # Go check the url: ($ok, $msg) = &Check_Url($file, $url); next if $Report_Local; # Log the result: if ( $ok ) { &Log("Check: OK: $msg:\t| $key\n"); } else { &Log("Check: ERR: $msg:\t| $key\n"); } } } sub Check_Actions { ############################################################## # Loops over all Form Actions found. Calls the checker subroutine # and logs the results. ############################################################## &Log("Info: Check_Actions not implemented\n"); } sub Report_Local { &Check_Anchors; &Check_Images; } sub Job_Skip { ############################################################## # When n Slaves are checking hyperlinks in parallel, this # function determines if a given slave should skip a given # hyperlink. Each slave skips over n-1 hyperlinks. ############################################################## local($count) = @_; if ( ! $Parallel ) { &Msg("Count: $count of $Check_Size. ($Check_Mode).\n") if $Debug; return 0; # Not Parallel: so don't skip. } if ( $My_Job == 0 ) { return 1; # Parallel and I am Master: so skip. } local($class); # Do modulo to determine if this is one I do. $class = $count % $Number_of_Jobs; if ( $class == $My_Job - 1 ) { &Msg("${Slave_Msg}class: $class, count=$count of $Check_Size. I DO IT ($Check_Mode).\n") if $Debug; return 0; # It's in my class: I can't skip. } else { return 1; # Somebody else's class: so skip. } } sub Check_Url { ############################################################## # Check a single URL ############################################################## # Input is the filename the link is in, and the URL of the link. local($file, $url) = @_; local($location); local($host, $port, $path); local($ok, $msg); # Get the Base Location of the file involved # (to determine relative paths, etc.): local($base_href) = $Base_Href{$file}; # Get the dir name of the file as well: local($dir) = &dir_name($file); &short_sleep(0.2) if $Parallel; # Check if link is a Remote or Local reference. if ( ! &remote_reference($url) ) { # Case Local: $location = 'LOCAL'; # Unset host and port for local file: $host = ''; # Not used. $port = ''; # Not used. # Look for relative or absolute reference: if ( $url =~ m,^/, ) { # not needed since did chdir() # $path = "${Root_Dir}/$url"; $path = '.' . $url; ; } else { $path = "$base_href/$url"; } } else { # Case Remote: $location = 'REMOTE'; } if ( $Report_Local ) { if ( $location eq 'LOCAL' ) { # &Log("$path\n"); print STDOUT "$path\n"; } return (1, ''); } # &Log("Info: checking URL: $url\n") if $Debug; # Now go try to find the URL local($umatch) = $Url_Skip_Match; ### $umatch =~ s/(\W)/\\$1/g; if ( $Url_Skip_Match ne '' && $url =~ m,$umatch,o ) { ($ok, $msg) = (1, "WARNING: $location: Skip Match: ($&)"); } elsif ( $location eq 'LOCAL' ) { # Check Local: ($ok, $msg) = &local_url_ok($path); } else { # Check Remote: ($ok, $msg) = &remote_url_ok($url); &Log("Info: remote_url_ok returned: $ok, $msg\n") if $Debug; } # Return whatever status and message was received. return ($ok, $msg); } sub remote_reference { ############################################################## # Checks if a URL is a remote reference. ############################################################## # Input is the URL. local($url) = @_; local($remote) = ''; # Look for http://, ..., mailto:..., etc. if ( $url =~ m,^(\w+)://, ) { $remote = $1; } elsif ( $url =~ /^mailto:/ ) { $remote = 'mailto'; } elsif ( $url =~ m,^telnet://, ) { $remote = 'telnet'; } elsif ( $url =~ m,^rlogin://, ) { $remote = 'rlogin'; } # Return the case found, '' means not remote. return $remote; } sub local_url_ok { ############################################################## # Checks if a Local URL actually exists. ############################################################## # Input is the file path indicated by URL. local($path) = @_; local($lmsg) = "LOCAL :"; if ( !defined($path) ) { &Msg("Info: local_url_ok: undefined \$path\n"); # if $Debug; ? $path = ''; } local($ok, $msg, $url); local($has_name) = ''; # Check if we should use HTTP to check this URL # even though it should be a local file. if ( $Use_Httpd ne '' ) { if ( $path =~ m,^\./, ) { # Trim leading . if ./ $path =~ s,^\.,,; } if ( $path =~ m,^/, ) { # Trim leading / $path =~ s,^/,,; } # Construct url out of local file. $url = "${Use_Httpd}/${path}"; &Log("Info: local_url_ok: Use_Httpd URL: \"$url\"\n") if $Debug; # So go check as though REMOTE URL ($ok, $msg) = &remote_url_ok($url, $Use_Httpd); $msg =~ s/REMOTE: //; $msg = "$lmsg (CHECKED VIA HTTP): $msg"; return ($ok, $msg); } # For local files we will always keep ../../ ?? &Log("Info: local_url_ok: looking for file: \"$path\"\n") if $Debug; # Remove the part: foo.html#name if ( $path =~ /#(.*)$/ ) { # Remember it to check later $has_name = $1; $path =~ s/#.*$//; } if ( $path =~ m,^/, ) { # Prepend . to leading / $path = '.' . $path; } # We cannot resolve /~user references (see -use_httpd) if ( $path =~ m,/\~, ) { return (1, "$lmsg WARNING: Skipping: contains ~user WWW dir"); } # Use filesystem to check for existence. # N.B. does not check readability. if ( -d $path ) { if ( $has_name ne '' ) { return (1, "$lmsg WARNING: Directory Exists ($path), but skipping NAME (#$has_name)"); } else { return (1, "$lmsg Directory Exists ($path)"); } } elsif ( -f $path ) { if ( $has_name ne '' ) { if ( ! open(FNAME, "$path") ) { return (1, "$lmsg WARNING: File Exists ($path), but could not open to check NAME"); } # Read contents to find NAME anchor local($ok, $contents); $contents = ''; while () { $contents .= $_; } close(FNAME); # Check for NAME anchor $ok = &Has_Href_Name($contents, $has_name); if ( $ok ) { return (1, "$lmsg File Exists ($path), and found NAME (#$has_name)"); } else { return (0, "$lmsg File Exists ($path), but COULD NOT find NAME (#$has_name)"); } } else { return (1, "$lmsg File Exists ($path)"); } } else { return (0, "$lmsg File Does Not Exist ($path)"); } } sub remote_url_ok { ############################################################## # Checks if a Remote URL actually exists. ############################################################## # Input is the URL local($url, $preserve) = @_; local($rmsg) = "REMOTE:"; local($ok, $msg); local($has_name) = ''; local($header) = ''; # Get type of remote reference: local($type) = &remote_reference($url); # Handle various cases: if ( ! $Check_Remotes ) { # Not supposed to check remotes: return (1, "$rmsg WARNING: Skip Remotes: $url"); } elsif ( ! $type ) { # Not a http://, mailto, ... etc. return (0, "$rmsg UNKNOWN TYPE: $url"); } elsif ( $type eq 'mailto' ) { # Case: mailto if ($Check_Mailto) { return &mailto_ok($url); } else { return (1, "$rmsg WARNING: Skip MAILTO: $url"); } } elsif ( $type eq 'telnet' ) { # Case: telnet if ($Check_Telnet) { return &telnet_ok($url); } else { return (1, "$rmsg WARNING: Skip TELNET: $url"); } } elsif ( $type eq 'rlogin' ) { # Case: rlogin if ($Check_Rlogin) { return &rlogin_ok($url); } else { return (1, "$rmsg WARNING: Skip RLOGIN: $url"); } } elsif ($type =~ /^gopher$/i ) { local($host, $port, $path, $user) = &extract_host($url); return &gopher_ok($host, $port, $path, $rmsg); } elsif ( $type =~ /^ftp$/i ) { local($host, $port, $path, $user) = &extract_host($url); return &ftp_ok($host, $port, $path, $user, $rmsg); } # Check the HTTP case: local($host, $port, $path, $user); # Remove upward ../.. references: if ( ! $Keep_Dots ) { local($orig) = $url; $url = &remove_url_dotdots($url, $preserve); if ( $url ne $orig ) { $rmsg .= " :"; } } local($version) = "HTTP/1.0"; # Version of HTTP we use. local($get_line) = ''; # Remove the part: foo.html#name if ( $url =~ /#(.+)$/ ) { # And save NAME to check later $has_name = $1; $url =~ s/#.*$//; } else { $has_name = ''; } local($get_method) = 'GET'; if ( ! $has_name && $Head_Only ) { $get_method = 'HEAD'; } # Parse the URL for host, port, and path ($host, $port, $path, $user) = &extract_host($url); &Log("Info: remote_url_ok: trying: host=$host, port=$port, path=$path\n") if $Debug; local($rc, $msg, $header, $try_url, $body, $i); $try_url = $url; local(@results); for ($i=0; $i < 4; $i++) { # try up to 4 redirs. ($rc, $msg, $result, $header, $body) = &fetch_http($try_url, $get_method, $version); push(@results, $result); $rmsg .= " $msg"; if ( ! $rc ) { return (0, "$rmsg: $url"); } elsif ( &http_status($result, '2\d\d') ) { last; } # some error case, 300, 400, ... Look for a redir: local($line, $loc); $loc = ''; foreach $line (split(/; /, $header)) { if ( $line =~ /^Location:\s+/ ) { $loc = $'; } } if ( $loc ne '' ) { if ( $loc !~ m,(\w+)://, ) { if ( $loc =~ m,^/, ) { $loc = "$type://$host$loc"; } else { # XXX? $loc = "$type://$host/$loc"; } } &Log("Info: Following REDIR: $loc: INITIAL_RESULT: $result\n"); $try_url = $loc; } else { last; } } local($res); foreach $res (@results) { if ( &http_status($res, '301') ) { $rmsg .= " REDIR WARNING: \($res)"; } } if ( $has_name ne '' && &http_status($result, '2\d\d') ) { # Check for the NAME anchor: local($ok); $ok = &Has_Href_Name($body, $has_name); if ( $ok ) { return (1, "$rmsg $result, and found NAME (#$has_name)"); } else { return (0, "$rmsg $result, but COULD NOT find NAME (#$has_name)"); } } # Check final result code if ( &http_status($result, '2\d\d') || &http_status($result, '302') ) { # is OK (return code= 200) 302 is "Found, or Moved Temporarily" return (1, "$rmsg $result"); } elsif ( &http_status($result, '3\d\d') ) { # is FOUND (return code= 30*) return (1, "$rmsg REDIR WARNING: $result"); } elsif ( &http_status($result, '4\d\d') ) { # is FOUND (return code= 40*) return (0, "$rmsg FOUND CLIENT ERROR: $result"); } elsif ( &http_status($result, '5\d\d') ) { # is FOUND (return code= 50*) return (0, "$rmsg FOUND SERVER ERROR: $result"); } else { # is ERROR (otherwise) return (0, "$rmsg FOUND ERROR: $result"); } } sub fetch_http { local($url, $get_method, $version) = @_; local($host, $port, $path, $user) = &extract_host($url); local($host_hdr) = "Host: $host\r\n"; local($proxy_msg) = ''; # Message to indicate we are using a proxy local($get_line); # Decide the connection host+port and the GET line we will send to it: if ( $Proxy_Host eq '' || $Proxy_Host =~ /^none$/i ) { # DIRECT CONNECTION CASE $get_line = "$get_method $path $version\r\n"; } else { # ADDITIONAL PROXY CONNECTION CASE # go via HTTP thru proxy $host = $Proxy_Host; $port = $Proxy_Port; $get_line = "$get_method $url $version\r\n"; $proxy_msg = "(PROXY=$host:$port): "; } # Open a TCP socket to the remote host: local($HTTP); local($i); local($max) = 3; for ($i=1; $i <= $max; $i++) { ($HTTP) = &Connect($host, $port); # See if connection was OK: if (&SocketError($HTTP)) { if ( $i != $max && $HTTP =~ /Connect:gethostbyname/ ) { sleep 1; next; } # Clean up error msg a bit: local($msg) = $HTTP; $msg =~ s/main\'KSOCKETSCONNECT/SOCK/g; $msg =~ s/Ksockets://g; return (0, "${proxy_msg}COULD NOT CONNECT ($host,$port): $msg"); } else { last; } } # Unbuffer output stream: &unbuffer($HTTP); # Send the request: print $HTTP $get_line; print $HTTP $host_hdr; print $HTTP $User_Agent_Lines if $User_Agent_Lines; print $HTTP "\r\n"; if ( $Shutdown ) { shutdown($HTTP, 1); &Msg("Shutdown: $url\n"); } elsif ( $Shutdown_Match ne '' ) { if ( $url =~ /$Shutdown_Match/o ) { shutdown($HTTP, 1); &Msg("Shutdown: $url\n"); } } local($result, $header); $result = ''; $header = ''; # Get the first HTTP line sent back: if ( $Timeout > 0 ) { $Signal_Result = ''; chop($result = &read_line($HTTP)); if ( $Signal_Result ne '' ) { $result .= $Signal_Result; $Signal_Result = ''; } } else { $result = <$HTTP>; chop($result); } if ( !defined($result) ) { $result = ''; close($HTTP); &Msg("Msg: read returned UNDEFINED.\n"); return (0, "${proxy_msg}read UNDEFINED."); } elsif ( $result =~ s/^$COBERR\s*//o ) { close($HTTP); return (0, "${proxy_msg}$result"); } # Clean the end of the string a bit: $result =~ s/\r//g; # Get the rest of the HTTP header local($location); local($length) = ''; while (<$HTTP>) { last if /^\r\n/; last if /^\n/; chop($_); $_ =~ s/\r//g; if ( $_ =~ /^Location:\s+/i ) { $location = $'; } elsif ( $_ =~ /Content-Length:\s+/i ) { $length = $'; } $header .= $_ . '; '; } $header =~ s/; $//; local($hmsg) = $get_line; $hmsg =~ s/[\r\n]//g; $hmsg = $proxy_msg . " HOST=$host: PORT=$port: \[HEADER: " . ">>> $hmsg; " . $header . ']'; if ( $get_method eq 'HEAD' || ! &http_status($result, '2\d\d') ) { close($HTTP); return (1, $hmsg, $result, $header); } # Download the whole document: local($contents) = ''; if ( $length =~ /^\d+$/ ) { read( $HTTP, $contents, $length); } else { while (<$HTTP>) { $contents .= $_; } } close($HTTP); return (1, $hmsg, $result, $header, $contents); } sub gopher_ok { ############################################################## # Minimal support for gopher protocol. ############################################################## local($host, $port, $path, $rmsg) = @_; local($GOPHER, $get_line); $port = 70 unless $port; # No leading / $path =~ s,^/,,; # Hack: single digit if ( $path =~ /^\d\d/ ) { $path =~ s/^\d//; } $path =~ s/%20/ /g; $get_line = $path; # Open a TCP socket to the remote host: ($GOPHER) = &Connect($host, $port); # See if connection was OK: if (&SocketError($GOPHER)) { # close $GOPHER; local($msg) = $GOPHER; # Clean up error msg a bit: $msg =~ s/main\'KSOCKETSCONNECT/SOCK/g; $msg =~ s/Ksockets://g; return (0, "$rmsg COULD NOT CONNECT ($host,$port): $msg"); } # Unbuffer output stream: &unbuffer($GOPHER); # Send the request: print $GOPHER $get_line; print $GOPHER "\r\n"; local($result); $result = ''; # Get the first GOPHER line sent back: if ( $Timeout > 0 ) { $Signal_Result = ''; chop($result = &read_line($GOPHER)); if ( $Signal_Result ne '' ) { $result .= $Signal_Result; $Signal_Result = ''; } } else { chop($result = <$GOPHER>); } if ( !defined($result) ) { $result = ''; close($GOPHER); &Msg("Msg: read returned UNDEFINED.\n"); return (0, "$rmsg read UNDEFINED."); } elsif ( $result =~ s/^$COBERR\s*// ) { return (0, "$rmsg $result"); } elsif ( $result eq '' ) { $result = ''; # how to get quotes? } # Clean the end of the string a bit: $result =~ s/\r//g; close($GOPHER); # Minimal check for OK if ($result =~ /cannot access|error\.host|sigalrm/i && $result ne '') { return (0, "$rmsg $result"); } else { return (1, "$rmsg $result"); } } sub ftp_ok { ############################################################## # Simple dialog to check for existence of FTP file # There has to be a better way than RETR... ############################################################## local($host, $port, $path, $user, $rmsg) = @_; local($FTP, $get_line, $passwd, $private); # switch for private passwd $private = 0; $port = 21 unless $port; # Open a TCP socket to the remote host: ($FTP) = &Connect($host, $port); # See if connection was OK: if (&SocketError($FTP)) { # close $FTP; local($msg) = $FTP; # Clean up error msg a bit: $msg =~ s/main\'KSOCKETSCONNECT/SOCK/g; $msg =~ s/Ksockets://g; return (0, "$rmsg COULD NOT CONNECT ($host,$port): $msg"); } # Work out User and Password: $passwd = $Ftp_Passwd; if ( $user eq '' ) { $user = 'ftp'; } elsif ( $user =~ /^([^:]+):(.*)$/ ) { $user = $1; $passwd = $2; $private = 1; } # Unbuffer output stream: &unbuffer($FTP); local($result, $rtmp); $result = ''; # Get the first FTP line sent back: if ( $Timeout > 0 ) { $Signal_Result = ''; chop($result = &read_line($FTP)); if ( $Signal_Result ne '' ) { $result .= $Signal_Result; $Signal_Result = ''; } } else { chop($result = <$FTP>); } if ( !defined($result) ) { $result = ''; close($FTP); &Msg("Msg: read returned UNDEFINED.\n"); return (0, "$rmsg read UNDEFINED."); } elsif ( $result =~ s/^$COBERR\s*// ) { close($FTP); return (0, "$rmsg $result"); } elsif ( $result !~ /^220/ ) { print $FTP "QUIT\n"; close($FTP); return (0, "$rmsg $result"); } elsif ( $result eq '' ) { $result = ''; # how to get quotes? } # Clean the end of the string a bit: $result =~ s/\r//g; $rtmp = $result; $result = "[DIALOG: $result; "; # Loop over rest of Greeting: if ( $rtmp =~ /^220-/ ) { while (1) { chop($rtmp = <$FTP>); $rtmp =~ s/\r//g; $result .= "$rtmp; "; if ( $rtmp !~ /^220/ ) { print $FTP "QUIT\n"; close($FTP); return (0, "$rmsg BAD GREETING. $result]"); } last if $rtmp !~ /^220-/; } } # Send USER and loop over rest of reply: print $FTP "USER $user\n"; $result .= ">>> USER $user; "; while (1) { chop($rtmp = <$FTP>); $rtmp =~ s/\r//g; $result .= "$rtmp; "; if ( $rtmp !~ /^331/ ) { print $FTP "QUIT\n"; close($FTP); return (0, "$rmsg BAD PASSWD PROMPT. $result]"); } last if $rtmp !~ /^331-/; } # Send password and loop over rest of reply: print $FTP "PASS $passwd\n"; if ( $private ) { $result .= ">>> PASS ; "; } else { $result .= ">>> PASS $passwd; "; } while (1) { chop($rtmp = <$FTP>); $rtmp =~ s/\r//g; $result .= "$rtmp; "; if ( $rtmp !~ /^230/ ) { print $FTP "QUIT\n"; close($FTP); return (0, "$rmsg BAD LOGIN. $result]"); } last if $rtmp !~ /^230-/; } # Send the request for the Directory: print $FTP "RETR $path\n"; $result .= ">>> RETR $path; "; chop($rtmp = <$FTP>); $rtmp =~ s/\r//g; $result .= "$rtmp; "; # Close it we are done. print $FTP "QUIT\n"; close($FTP); # No looping, just look at single returned line. if ( $rtmp =~ /^425/ ) { return (1, "$rmsg FILE LOOKS OK. $result]"); } elsif ( $rtmp =~ /^550/ ) { if ( $rtmp =~ /not.*plain.*file/i ) { return (1, "$rmsg DIR LOOKS OK. $result]"); } return (0, "$rmsg PROBLEM WITH FILE/DIR. $result]"); } } sub http_status { ############################################################## # Generic routine to check HTTP return status ############################################################## local($result, $match) = @_; local($http_version) = ''; if ( $result =~ m,^\s*(HTTP/\d+\.\d+)\s+(${match}), ) { $http_version = $1; return $2; } else { return ''; } } sub remove_url_dotdots { ############################################################## # Resolve ../.. references from a URL ############################################################## local($url, $preserve) = @_; local($host, $path, $url0); $url0 = $url; $url =~ s,/\./,/,g; if ( $url =~ m,^(\w+\://[^/]+)(.*)$, ) { $host = $1; $path = $2; } else { $host = ''; $path = $url; } $path = &remove_dotdots($path); # Now get rid of leading ..'s $path =~ s,^/(\.\./)+,/,; $path =~ s,^(\.\./)+,/,; if ( $host ne '' ) { if ( $path !~ m,^/, ) { $url = "$host/$path"; } else { $url = "${host}${path}"; } } else { $url = "${host}${path}"; } if ( defined($preserve) && $preserve ne '' ) { local($match) = $preserve; local($url1) = $url; $match =~ s/(\W)/\\$1/g; if ( $url !~ m,^$match, ) { if ( $url =~ m,^(\w+\://[^/]+)(.*)$, ) { $host = $1; $path = $2; } else { $host = ''; $path = $url; } $host = $preserve; if ( $host ne '' ) { if ( $path !~ m,^/, ) { $url = "$host/$path"; } else { $url = "${host}${path}"; } } else { $url = "${host}${path}"; } &Msg("UrlDotDot: PRESERVED $url1 => $url\n") if $Debug; } } if ( $url0 ne $url ) { &Msg("UrlDotDot: SUBSTITUTED $url0 => $url\n") if $Debug; } return $url } sub remove_dotdots { ############################################################## # Resolve ../.. references from a PATH ############################################################## local($path) = @_; local($orig) = $path; local(@path1, @path2, $section, $tmp); local($pre, $post) = ('', ''); # Remove and record leading or trailing slashes. $pre = '/' if $path =~ s,^[/]+,,; $post = '/' if $path =~ s,[/]+$,,; foreach $section (split(/\//, $path)) { next if $section eq '.'; # Skip /./ next if $section eq ''; # Skip // push @path1, $section; # Push onto new path } # Loop over each section. Couldn't find a REGEX loop. foreach $section (@path1) { if ( $section eq '..' ) { # Take care of a '..' reference # Must have something to delete if ( defined(@path2) ) { $tmp = $path2[$#path2]; if ( defined($tmp) && $tmp ne '' && $tmp ne '..' ) { # Delete last section of path2 if not another '..' or # empty ''. It can only be empty from a pop I think. pop @path2; next; } } } push @path2, $section; } # Restore leading and trailing slashes (if any). $path = $pre . join('/', @path2) . $post; return $path; } sub Alarm_Handler { ############################################################## # SIGALRM Handler. Close the current filehandle. Hopefully # will kill a blocked pipe. ############################################################## &Log("Info: SIGALRM: Caught signal.\n"); $Signal_Result = "Caught SIGARLM: Probably timeout on hung connection"; if ( $Current_Handle ne '' ) { &Log("Info: SIGALRM: closing $Current_Handle\n"); close($Current_Handle); $Current_Handle = ''; } else { &Log("Info: SIGALRM: no handle to close.\n"); } } sub read_line { ############################################################## # Read a line from a filehandle. Set an alarm to break up # hung connections. Another alternative is to use select() ############################################################## local($fh) = @_; local($line) = ''; local($select_sleep) = 5; local($select_tries) = 2; if ( $Wait_Method eq 'alarm' && $Timeout > 0 ) { # Use alarm() to wake up: $SIG{'ALRM'} = 'Alarm_Handler'; $Current_Handle = $fh; &Msg("Msg: Setting alarm($Timeout)...\n") if $Debug; alarm($Timeout); &Msg("Msg: alarm is set.\n") if $Debug; } elsif ( $Wait_Method eq 'select' && $Timeout > 0 ) { # Use select() to see if handle ready: &Msg("Msg: Trying select($Timeout)...\n") if $Debug; local($ready, $timeleft, $i, $j); local($select_ok) = 0; for ($i = 0; $i < $select_tries; $i++) { ($ready, $timeleft) = &Ready($fh, $Timeout); if ( !defined($ready) || ! $ready ) { $j = $i + 1; &Msg("Msg: select($Timeout) try $j FAILED, timeleft: $timeleft. Sleeping $select_sleep...\n") if $Debug; sleep $select_sleep; } else { &Msg("Msg: select($Timeout) OK, timeleft: $timeleft.\n") if $Debug; $select_ok = 1; last; } } if ( ! $select_ok ) { &Msg("Msg: COULD NOT select($Timeout) FILEHANDLE.\n") if $Debug; return "${COBERR}select($Timeout) on socket read FAILED\n"; } } $line = <$fh>; if ( !defined($line) ) { &Msg("Msg: read undefined line from handle: $fh\n"); $line = ''; } # &Msg("Msg: Just read from handle: $fh: $line\n"); if ( $Wait_Method eq 'alarm' ) { alarm(0); &Msg("Msg: Cleared alarm(0).\n") if $Debug; $SIG{'ALRM'} = 'IGNORE'; $Current_Handle = ''; } return $line; } sub mailto_ok { ############################################################## # Check a mailto URL, but guessing the hostname... ############################################################## # Input is the mailto: URL local($url) = @_; $url =~ s/REMOVE.SPAM-//ig; local($addr, $user, $host); local($tmp, $domain, $guess, @try); local($SMTP, $errs, $greet, $reply, $ok); if ( $url !~ /mailto:(.*)$/ ) { return (0, "REMOTE: NOT A MAILTO! $url"); } elsif ( $url =~ /mailto:(.*)$/ ) { $addr = $1; # Must succeed. } $addr =~ s/\?.*$//; # remove "action": user@host?subject=FooBar ($user, $host) = split(/\@/, $addr, 2); if ( $user eq '' || $host eq '' ) { return (0, "REMOTE: MALFORMED MAILTO: $url"); } local($mxout); $mxout = `mx -v '$addr' 2>/dev/null | egrep '^ACCEPTED|^REJECTED'`; $mxout =~ s/\n/ /g; if ( $mxout =~ /REJECTED/ ) { return (0, "REMOTE: $mxout"); } elsif ( $mxout =~ /ACCEPTED/ ) { return (1, "REMOTE: $mxout"); } else { return (0, "REMOTE: $mxout"); } # not used: push(@try, $host); if ( $host =~ /^[\d+\.]+$/ ) { ; # that's all... } else { foreach $guess (@Mail_Guess) { $tmp = "$guess.$host"; push(@try, $tmp); } if ( $host =~ /(\w+\.\w+)$/ ) { $domain = $1; push(@try, $domain); foreach $guess (@Mail_Guess) { $tmp = "$guess.$domain"; push(@try, $tmp); } } } $errs = ''; $ok = ''; # foreach attempt, connect to SMTP port: foreach $host (@try) { ($SMTP) = &Connect($host, 25); if ( &SocketError($SMTP) ) { # Check for error, set message local($msg) = $SMTP; $msg =~ s/main\'KSOCKETSCONNECT/SOCK/g; $msg =~ s/Ksockets://g; $errs .= "$host, $msg; "; } else { # Connection OK, try to Verify user: &unbuffer($SMTP); chop($greet = <$SMTP>); $greet =~ s/\r//g; if ( $greet =~ /^220/ ) { # Greeting OK:, send VRFY print $SMTP "VRFY $user\n"; chop($reply = <$SMTP>); $reply =~ s/\r//g; if ( $reply =~ /^250/ ) { $ok = "$user\@$host, $reply"; print $SMTP "QUIT\n"; close($SMTP); last; } else { $errs .= "$user\@$host, $reply; " } } else { # Greeting NOT OK: $errs .= "$host, $greet; "; } # Close it: print $SMTP "QUIT\n"; close($SMTP); } } # Return result: if ( $ok ) { return (1, "REMOTE: : $ok"); } else { return (0, "REMOTE: : $errs"); } } sub telnet_ok { ############################################################## # Check a telnet URL to port 23 of a host. ############################################################## # Input is the telnet: URL local($url) = @_; local($addr, $user, $host); local($tmp, $domain, $guess, @try); local($TELNET, $errs, $greet, $reply, $ok); # Double check URL if ( $url !~ m,telnet://(.*)$, ) { return (0, "REMOTE: NOT A TELNET! $url"); } elsif ( $url =~ m,telnet://(.*)$, ) { # Must succeed, extract Host $host = $1; } $host =~ s/\:\d+$//; # Seems port is ignored... if ( $host eq '') { return (0, "REMOTE: MALFORMED TELNET: $url"); } $errs = ''; $greet = ''; $ok = ''; # Connect to the machine: ($TELNET) = &Connect($host, 23); if ( &SocketError($TELNET) ) { # Bad connection local($msg) = $TELNET; $msg =~ s/main\'KSOCKETSCONNECT/SOCK/g; $msg =~ s/Ksockets://g; $errs .= "$host, $msg; "; } else { &unbuffer($TELNET); local($c, $i) = ('', 0); # Connection OK, try to read just 1 character. for ($i = 0; $i < 1; $i++) { $c = getc($TELNET); $greet .= $c; } if ( $greet !~ /^$/ ) { $ok = "$host, $greet"; } else { $errs .= "$host, $greet; " } close($TELNET); } # Return result: if ( $ok ) { return (1, "REMOTE: <$url>: $ok"); } else { return (0, "REMOTE: <$url>: $errs"); } } sub rlogin_ok { ############################################################## # Check a rlogin URL to port 513 of a host. ############################################################## # Todo: does not check user. What is rlogin protocol? # on Unix run rlogin program or something like that... # Input is the rlogin: URL local($url) = @_; local($addr, $user, $host); local($tmp, $domain, $guess, @try); local($RLOGIN, $errs, $greet, $reply, $ok, $null, $term); # Null character (duh) is used to separate fields $null = pack "x"; $term = 'xterm/9600'; # Double check URL if ( $url !~ m,rlogin://(.*)$, ) { return (0, "REMOTE: NOT A RLOGIN! $url"); } elsif ( $url =~ m,rlogin://(.*)$, ) { # Must succeed, extract Host $host = $1; } if ( $host =~ /\@/ ) { ($user, $host) = split(/\@/, $host, 2); } # Whoops, no user, try "guest" if ( $user eq '' ) { $user = 'guest'; } $host =~ s/\:\d+$//; # Seems port is ignored... if ( $host eq '') { return (0, "REMOTE: MALFORMED RLOGIN: $url"); } $greet = ''; $errs = ''; $ok = ''; # Connect to the machine: ($RLOGIN) = &Connect($host, 513); if ( &SocketError($RLOGIN) ) { # Bad connection local($mess) = $RLOGIN; $mess =~ s/main\'KSOCKETSCONNECT/SOCK/g; $mess =~ s/Ksockets://g; $errs .= "$host, $mess; "; } else { &unbuffer($RLOGIN); # Send a somewhat bogus init data: print $RLOGIN $null; print $RLOGIN $user; print $RLOGIN $null; print $RLOGIN $user; print $RLOGIN $null; print $RLOGIN $term; print $RLOGIN $null; local($c, $i) = ('', 0); # Just try to read some characters. local($nchars) = 17; # Probably will get permission denied from high port anyway.... for ($i = 0; $i < $nchars; $i++) { $c = getc($RLOGIN); $greet .= $c; } # So just take any characters as OK. if ( $greet !~ /^$/ ) { $ok = "$host, $greet"; } else { $errs .= "$host, $greet; " } close($RLOGIN); } # Return result: if ( $ok ) { return (1, "REMOTE: <$url>: $ok"); } else { return (0, "REMOTE: <$url>: $errs"); } } sub base_href { ############################################################## # Given a filename, relative to the server root of the HTML # source tree, extract the path of the directory containing it ############################################################## # Input is filename local($file) = @_; local($dir); # Just use the dirname: $dir = &dir_name($file); &Msg("Info: base_href: Before: $file, After: $dir\n") if $Debug; return $dir; } sub base_name { ############################################################## # Extract the base name of a file. ############################################################## # Input is filename local($x) = @_; local($abs) = 0; local($y) = $x; if ( $x =~ m,^/, ) { $abs = 1; # absolute path } $x =~ s,[/]*$,,; # remote leading / $x =~ s,^.*/,,; # and leading up to last / if ( $x eq '' ) { if ( $abs ) { $x = '/'; } else { &Msg("Info: base_name: EMPTY BASENAME: $y\n"); } } return $x; } sub dir_name { ############################################################## # Extract the base name of a file. ############################################################## # Input is filename local($x) = @_; local($abs) = 0; local($y) = $x; if ( $x =~ m,^/, ) { $abs = 1; # absolute path } $x =~ s,/[^/]*$,,; # remove "/stuff/to/end" if ( $x eq '' ) { if ( $abs ) { $x = '/'; # / has dirname / } else { $x = "."; # foo has dirname "." } } return $x; } sub trim_ends { ############################################################## # Remove: # leading and trailing spaces # leading and trailing quotes: " ############################################################## # Input is string to be transformed: local($x) = @_; if ( ! defined($x) ) { &Msg("Info: trim_ends: Trimming undefined variable:\n") if $Debug; $x = ''; } $x =~ s/^\s*//o; $x =~ s/\s*$//o; $x =~ s/^\"//o; $x =~ s/\"$//o; return $x; } sub Log { ############################################################## # Log a string/message to the log file and possibly STDERR # as well. ############################################################## local($fh); if ( !defined($Log) ) { $fh = 'STDERR'; } else { $fh = $Log; } # Unbuffer just to be sure: &unbuffer($fh); # Print to Log handle: print $fh @_; # Print to STDERR too if verbose is set: if ($Verbose) { &Msg(@_); } } sub Msg { ############################################################## # Simple wrapper for printing to a message to stderr ############################################################## local($fh); $fh = 'STDERR'; print $fh @_; } sub help { ############################################################## # Print out the Usage statement using more. ############################################################## if ( -f "/bin/more" ) { open(MORE, "|/bin/more"); print MORE $Usage; close(MORE); } else { print STDOUT $Usage; } } sub create { ############################################################## # Replaces Unix "touch" # NOT USED. ############################################################## local($file) = @_; open(FH, ">$file") || die "cannot create flag file \"$file\", $!"; close(FH); } sub date { ############################################################## # Returns unix style date ############################################################## local($t); $t = localtime; # PERL5 dep return $t } sub unbuffer { ############################################################## # Set a file handle to be unbuffered. ############################################################## local($fh) = @_; local($old_fh) = select($fh); $| = 1; select($old_fh); } sub short_sleep { ############################################################## # Do a sleep with fractional seconds. $time is a float. ############################################################## local($time) = @_; select(undef, undef, undef, $time); } sub do_wrap { ############################################################## # Turn the one line output into indented multi-line. ############################################################## local($HC) = '__HEADER_LOCATION__'; local($DC) = '__DIALOG_LOCATION__'; local($line, $header, $dialog, $url, $file, $desc, $id, $pre, $post); # Loop over the input while (<>) { chop($line = $_); $header = ''; $dialog = ''; # Print File: as is. if ( $line =~ /^File:/ ) { print $line, "\n\n"; next; } # Don't split an Info line < 80 chars. if ( $line =~ /^Info:/ && length($line) < 80 ) { if ( $line !~ /_url_ok/ ) { print $line, "\n\n"; next; } } # Look for [HEADER: .... ] block. if ( $line =~ s/(\[HEADER:[^\]]*\])/$HC/ ) { $header = $1 } # Look for [DIALOG: .... ] block. if ( $line =~ s/(\[DIALOG:[^\]]*\])/$DC/ ) { $dialog = $1 } $url = ''; $file = ''; $desc = ''; # Insert needed | splitter for description: if ( $line =~ /^(Anchor|Image|Form)/ ) { $line =~ s/\:/\: \| /; if ( $line =~ s,\|\s*([^\|]+\s*)$,, ) { $desc = $1; } } # Remove trailing url section if ( $line =~ s,\|\s*([^\|\s]+\s*)$,, ) { $url = $1; } # Remove trailing file section if ( $line =~ s,\|\s*([^\|\s]+\s*)$,,) { $file = $1; } # Trim spaces from end. $line =~ s/\s*$//; # General rule `:' ==> : with newline and tab. $line =~ s/:\s+/:\n\t/g; # Indent the [HEADER: ... ] if any. if ( $header ne '' ) { $header =~ s/;\s+]/]/; $id = "\n\t"; $header =~ s/\[HEADER:\s+/${id}[HEADER:${id}\t/; $id .= "\t"; $header =~ s/\;\s+/$id/g; chop($id); $header =~ s/\]/$id\]/g; $line =~ s/$HC/$header/; } # Indent the [DIALOG: ... ] if any. if ( $dialog ne '' ) { $dialog =~ s/;\s+]/]/; $id = "\n\t"; $dialog =~ s/\[DIALOG:\s+/${id}[DIALOG:${id}\t/; $id .= "\t"; $dialog =~ s/\;\s+/$id/g; chop($id); $dialog =~ s/\]/$id\]/g; $line =~ s/$DC/$dialog/; } # Indent the mailto: if any. if ( $line =~ /\\:/, $line, 2); $pre .= ">:"; $pre =~ s/\ 80 (HTTP) $port = 80; } return ($host, $port, $path, $user); } sub Check_Windows { ############################################################## # Run perl -version to check for Windoze. ############################################################## local($perl_msg); local($perl_cmd) = 'perl'; if ( $ENV{'MY_PERL'} ne '' ) { $perl_cmd = $ENV{'MY_PERL'}; } $perl_msg = `$perl_cmd -version`; if ($perl_msg =~ /win32/i ) { return 1; } else { return 0; } } #---------------------------------------------------------------------- package Ksockets; #=========================================================================== # Ksockets v0.3: Karl's sockets. Copyright (c) 1996-2006 by Karl J. Runge # Socket macros to perl's low level socket interface. #=========================================================================== sub main'Use_Ksockets { ################################################################### # # Initialize Ksockets Package # # USAGE: # # &Use_Ksockets(); # ################################################################### if ( defined($ksockets_initialized) ) { return; } if (0) { # List of functions in this package: &main'Use_Ksockets; &main'Server; &main'Connect; &main'AcceptConnection; &main'Recv; &main'Send; &main'Ready; &main'SocketError; &main'IP_From_Name; &main'SocketInfo; &main'Local_IP; &main'spit_ksockets; &get_sock_type; # List of variables referenced: $main'Debug; } $pkg = 'Ksockets'; $Server_Handles = 'KSOCKETSSERVER00000'; $Connect_Handles = 'KSOCKETSCONNECT0000'; $Ksockets_Handle_String = 'KSOCKETS'; $AF_INET = 2; # choose INET family my $is_svr4 = ''; if ( defined($main'IS_SYSV) ) { if ( $main'IS_SYSV ) { $is_svr4 = 1; } else { $is_svr4 = 0; } } elsif ( defined($main'Is_Sysv) ) { if ( $main'Is_Sysv ) { $is_svr4 = 1; } else { $is_svr4 = 0; } } else { if ( -x "/bin/uname" || -x "/usr/bin/uname" ) { # OK, this is quite a hack... my $osname = `uname -s`; my $release = `uname -r`; if ( $osname =~ /sunos/i && $release !~ /^4\./ ) { $is_svr4 = 1; } # add others ... } } if ( ! $is_svr4 ) { $SOCK_STREAM = 1; # BSD flavor $SOCK_DGRAM = 2; # known to work on Linux # SunOS 4. } else { $SOCK_STREAM = 2; # SVR4 flavor $SOCK_DGRAM = 1; # known to work on solaris } # todo: SOCK_RAW SOCK_SEQPACKET SOCK_RDM $Default_Proto = 'tcp'; $Default_Backlog = '8'; # template for packing socket address into binary. $SockAddr = 'S n a4 x8'; if ( $main'Local ne '' ) { $Hostname = $main'Local; } else { chop($Hostname = `hostname`); } my($name,$aliases,$type,$len,$localaddr) = gethostbyname($Hostname); $Local_IP = join('.', unpack('C4',$localaddr)); $ksockets_initialized = 1; } # end Use_Ksockets sub main'Server { ################################################################### # # Open a listening socket on a port. # # USAGE: # # $file_handle = &Server($port, $backlog[OPT], $proto_in[OPT]) # # Then use $file_handle in &AcceptConnection($file_handle) calls # Test for errors via a call to: &SocketError($file_handle) # # e.g: # $Server = &Server('8888'); # die "$Server" if &SocketError($Server); # ($Client) = &AcceptConnection($Server); # die "$Client" if &SocketError($Client); # ... ################################################################### my($port, $backlog, $proto_in) = @_; my($name, $aliases, $proto); if ( !defined($proto_in) || $proto_in eq '' ) { $proto_in = $Default_Proto; } if ($port !~ /^\d+$/) { # port could be "smtp" though not likely ($name, $aliases, $port, $proto) = getservbyname($port, $proto_in); } ($name, $aliases, $proto) = getprotobyname($proto_in); $backlog = $Default_Backlog unless $backlog; print STDERR "Server:port:$port\n" if $main'Debug; # pack our address my $us = pack($SockAddr, $AF_INET, $port, "\0\0\0\0"); my $listen = ++$Server_Handles; # force filehandles into callers' package my $package = caller; $listen =~ s/^[^']+$/$package'$&/; my $old = select($listen); $| = 1; select($old); my $SOCK_TYPE = &get_sock_type($proto_in); if ( ! socket($listen, $AF_INET, $SOCK_TYPE, $proto) ) { return "Error:$pkg:Server:socket($listen, $AF_INET, $SOCK_TYPE, $proto): \"$!\""; } if ( ! bind($listen, $us) ) { return "Error:$pkg:Server:bind($listen, SOCKADDR): \"$!\""; } if ( $proto_in eq 'tcp' ) { if ( ! listen($listen, $backlog) ) { return "Error:$pkg:Server:listen($listen, $backlog): \"$!\""; } } return $listen; } # end Server sub main'Connect { ################################################################### # # Make a connection to a remote host port. # # USAGE: # # ($file_handle, $host_ip, $Local_IP, $host_sockaddr, $local_sockaddr) = # &Connect($host, $port, $proto_in[OPT]) # # Then use $file_handle in to read and write from remote connection # # Test for errors via a call to: &SocketError($file_handle) # # Use $host_ip and $local_ip as IP address information. # The $host_sockaddr and $local_sockaddr are packed socket addresses # for us in connectionless sockets (e.g. 'udp') # # e.g: # ($Remote) = &Connect('www.netscape.com', '80'); # die "$Remote" if &SocketError($Remote); # ... ################################################################### my($host, $port, $proto_in) = @_; my($name, $aliases, $proto); my($type,$len); my($local_addr, $local_sockaddr); # , $local_ip); my($host_addr, $host_sockaddr, $host_ip); if ( !defined($proto_in) || $proto_in eq '' ) { $proto_in = $Default_Proto; } if ($port !~ /^\d+$/) { # port could be "smtp" ($name, $aliases, $port, $proto) = getservbyname($port, $proto_in); if ( $port eq '' ) { return ("Error:$pkg:Connect:getservbyname($port, $proto_in): Empty port returned", '', '', '', ''); } } ($name, $aliases, $proto) = getprotobyname($proto_in); if ( !defined($proto) || $proto eq '' ) { return ("Error:$pkg:Connect:getprotobyname($proto_in): Empty proto returned", '', '', '', ''); } if ( $host =~ /^\d+/ ) { $host_ip = $host; $host_addr = pack('C4', split(/\./, $host, 4)); } else { ($name,$aliases,$type,$len,$host_addr) = gethostbyname($host); if ( !defined($host_addr) || $host_addr eq '' ) { return ("Error:$pkg:Connect:gethostbyname($host): Empty host_addr returned", '', '', '', ''); } $host_ip = join('.', unpack('C4', $host_addr)); } # $local_addr = pack('C4', split(/\./, $Local_IP, 4)); $local_addr = pack('C4', split(/\./, "0.0.0.0", 4)); $local_sockaddr = pack($SockAddr, $AF_INET, 0, $local_addr); $host_sockaddr = pack($SockAddr, $AF_INET, $port, $host_addr); my $connection = ++$Connect_Handles; # force filehandles into callers' package my $package = caller; $connection =~ s/^[^']+$/$package'$&/; my $old = select($connection); $| = 1; select($old); my $SOCK_TYPE = &get_sock_type($proto_in); if ( ! socket($connection, $AF_INET, $SOCK_TYPE, $proto) ) { return ("Error:$pkg:Connect:socket($connection, $AF_INET, $SOCK_TYPE, $proto): \"$!\"", '', '', '', ''); } if ( ! bind($connection, $local_sockaddr) ) { return ("Error:$pkg:Connect:bind($connection, $Local_IP): \"$!\"", '', '', '', ''); } if ( $proto_in eq 'tcp' ) { if ( ! connect($connection, $host_sockaddr) ) { return ("Error:$pkg:Connect:connect($connection, $host_ip): \"$!\"", '', '', '', ''); } } return ($connection, $host_ip, $Local_IP, $host_sockaddr, $local_sockaddr); } # end Connect sub main'AcceptConnection { ################################################################### # # Wait for and accept a connection on a listening port. # # Listening port created with &Server(); # # USAGE: # # ($file_handle, $address, $client_port) = # &AcceptConnection($listen_file_handle)) # # Then use $file_handle in to read and write from remote connection # Test for errors via a call to: &SocketError($file_handle) # # e.g: # ($OurService) = &Server('8888'); # die "$OurService" if &SocketError($OurService); # ($Guest) = &AcceptConnection($OurService); # die "$Guest" if &SocketError($Guest); # ... ################################################################### my($listen_socket) = @_; my $real_socket = ++$Server_Handles; # force filehandles into callers' package my $package = caller; $real_socket =~ s/^[^']+$/$package'$&/; my $old = select($real_socket); $| = 1; select($old); my $address = ''; my $fail = ''; ($address = accept($real_socket, $listen_socket)) || ($fail = "Error:$pkg:AcceptConnection:accept($real_socket, $listen_socket): \"$!\""); if ( $fail ne '' ) { close($real_socket); return ($fail, '', ''); } my($af,$c_port,$inetaddr) = unpack($SockAddr,$address); $address = join('.', unpack('C4',$inetaddr)); return ($real_socket, $address, $c_port); } # end AcceptConnection sub main'Ready { ################################################################### # # See if a socket handle is ready to be read or written to. # # USAGE: # # ($handle_ready, $time_left ) = # &Ready($socket_handle, $time_out, $mode[OPT]) # # Given handle will be checked for readiness up to $time_out seconds # Default for $mode is 'read', but it could be 'write' or 'error' # # $handle_ready will be the integer result of the select() call, # i.e. the number of 'ready' handles. Should be 0 (problem) or # 1 (handle ready). # # If time_out is undef, it should block waiting for readiness. # # E.g.: # ($ready) = &Ready($socket, '30'); # if ($ready) { # # } else { # # } # ################################################################### my($socket, $time_out, $mode) = @_; my($handle_ready, $time_left, $in, $out) = ('', '', '',''); vec($in, fileno($socket), 1) = 1; # note, the $out = $in stuff is not useful since they are # reset every time. if ( ! defined($mode) || $mode eq 'read' ) { ($handle_ready, $time_left) = select($out = $in, undef, undef, $time_out); } elsif ( $mode eq 'write' ) { ($handle_ready, $time_left) = select(undef, $out = $in, undef, $time_out); } elsif ( $mode eq 'error' ) { ($handle_ready, $time_left) = select(undef, undef, $out = $in, $time_out); } elsif ( $mode eq 'sleep' ) { # undoc'd floating pt sleep ($handle_ready, $time_left) = select(undef, undef, undef, $time_out); } else { # should handle error case better $handle_ready = undef; $time_left = 'Error:Ready:unknown mode'; } return ($handle_ready, $time_left); } # end Ready sub main'Recv { ################################################################### # # Receive data from a socket. # # USAGE: # # ($data, $recv_address_info ) = &Recv($socket_handle, $size) # # ################################################################### my($socket, $size) = @_; my($recv_info, $buf) = ('', ''); $recv_info = recv($socket, $buf, $size, 0); return ($buf, $recv_info); } # end Recv sub main'Send { ################################################################### # # Send data to a socket. # # USAGE: # # $nchars = &Send($socket_handle, $message, $TO[OPT]) # # $nchars is number of characters sent. # optional $TO is for 'udp' sockets. The Remote sockaddr # returned on Connect() say. # # ################################################################### my($socket, $message, $to) = @_; my $send_info = ''; if ( !defined($to) || $to eq '' ) { $send_info = send($socket, $message, 0); } else { $send_info = send($socket, $message, 0, $to); } if ( !defined($send_info) ) { $send_info = "Error:$pkg:Send:send($socket, $message, 0, $to): \"$!\""; } return $send_info; } # end Send sub main'SocketError { ################################################################### # # Test for error on a Ksocket command. # # USAGE: # # ($socket_file_handle, .... ) = &Ksocket_Command(...) # # Ksocket_Command can be: # Server() # Connect() # AcceptConnection() # IP_From_Name() # # $error = &SocketError($socket_file_handle) # # if ( $error ) { # # } else { # # } # ################################################################### my($msg) = @_; if ( $msg =~ /^\s*$/ ) { return 1; } if ( $msg =~ /^\s*ERROR/i ) { return 1; } if ( $msg !~ /$Ksockets_Handle_String/ && $msg !~ /^\d+\.\d+\.\d+\.\d+$/ ) { return 1; } return 0; } # end SocketError sub main'IP_From_Name { ################################################################### # # Translate a machine host name into IP number nnn.nnn.nnn.nnn # # USAGE: # # $ip_number = &IP_From_Name($name) # # e.g. # $ip_number = &IP_From_Name('www.netscape.com'); # die "$ip_number" if &SocketError($ip_number); # ... # ################################################################### my($input_name) = @_; my($remote_address); my($name, $aliases, $type, $len, $ip); if ( $input_name =~ /^\d+\./ ) { return $input_name; } # Turn name -> ddd.ddd.ddd.ddd ($name,$aliases,$type,$len,$remote_address) = gethostbyname($input_name); if ( !defined($remote_address) || $remote_address eq '' ) { return "Error:$pkg:IP_From_Name:gethostbyname($input_name): \"$!\""; } $ip = join(".", unpack('C4',$remote_address)); return $ip; } # end IP_From_Name sub main'SocketInfo { ################################################################### # # Translate a internal address format into human readable information # # USAGE: # # ($r_port, $r_ip, $af) = &SocketInfo($address) # # $address comes from some socket operation, like recv # NOTE: not finished # ################################################################### my($input_address) = @_; my($af, $remote_port, $remote_address) = unpack($SockAddr, $input_address); $remote_address = join('.', unpack('C4', $remote_address)); return ($remote_port, $remote_address, $af) } # end SocketInfo sub get_sock_type { ################################################################### # # INTERNAL ROUTINE not available in main'. # # Translate string protocol name (e.g. tcp or udp) # into Socket type. # # USAGE: # # $SOCK_TYPE = &get_sock_type($name) # # e.g. # $SOCK_TYPE = &get_sock_type('udp'); # ... # ################################################################### my($name) = @_; if ( $name eq 'tcp' || $name eq '' ) { return $SOCK_STREAM; } elsif ( $name eq 'udp' ) { return $SOCK_DGRAM; } } # end get_sock_type sub main'Local_IP { my($ip) = @_; if ( $ip ne '' ) { $Local_IP = $ip; } return $Local_IP; } # end Local_IP sub main'spit_ksockets { open(THIS_FILE, "$0"); my $ok_to_print = ''; my $first = 1; while () { if (0 && $first ) { $first = ''; print STDOUT <<'EOQ'; #!/bin/sh -- # A comment mentioning perl, to prevent perl from looping. Indented to work with bash. eval 'exec perl -S $0 ${1+"$@"}' if 0; EOQ print STDOUT "\n"; print STDOUT "&Use_Ksockets();\n"; print STDOUT "\n"; print STDOUT "#----------------------------------------------------------------------\n"; } if ( /^package Ksockets/ ) { $ok_to_print = 1; } print STDOUT if $ok_to_print; if ( /^[\s\#]*END OF PACKAGE Ksockets/ ) { $ok_to_print = ''; } } } # END OF PACKAGE Ksockets __END__ Notes: HTTP/1.0 200 OK Date: Fri, 21 Nov 1997 03:24:30 GMT Server: Apache/1.1.3 Content-type: text/html Content-length: 1316 Last-modified: Tue, 18 Feb 1997 16:48:40 GMT HTTP/1.0 404 Not found Date: Fri, 21 Nov 1997 03:25:17 GMT Server: Apache/1.1.3 Content-type: text/html @links = ($start_dir); $host = .... $dir = $Retrieve; mkdirpath("$dir/$host"); while (@links) { @new = (); foreach $link (@links) { if ( ! -f "$dir/$host/$link" ) { push(@new, &Get_File_and_Links($host, $link, $dir); } } @links = @new; } sub Retrieve { local($url) = @_; local($host, $port, $path, $user); } sub Get_File_and_Links { }