#!/usr/bin/env perl
#
# usage: get.pl anyfile
#    or: get.pl http://anything.anywhere.com/anyfile.html
#
# Loads a document (local file or URL) into one string, then reports every
# match of each pattern in %pats, skipping matches whose captured text also
# matches the corresponding pattern in %notpats.
#
use strict;
use warnings;
use LWP::Simple;    # provides get() for fetching documents via http

# The document to scan is named by the first command-line argument.
my $filename = $ARGV[0];
die "usage: $0 anyfile | http://anything.anywhere.com/anyfile.html\n"
    unless defined $filename && length $filename;

my $content;
if ($filename =~ m{\Ahttps?://}i) {
    print "connecting to: $filename\n";
    $content = get($filename);

    # get() returns undef on failure; an empty document is still valid.
    die("no $filename, die") unless defined $content;
}
else {
    print "reading local file: $filename\n";
    open my $in, '<', $filename or die("no $filename, die: $!");

    # With $/ undefined the whole file is read into a single string instead
    # of line by line (see "perldoc perlvar" for $/).
    local $/;
    $content = <$in>;
    close $in;
    die("no $filename, die") unless defined $content;
}

# See "perldoc perlrequick" for an introduction to the regex syntax.
# The pattern strings are placed within q# # instead of " or ',
# because " and ' occur within the patterns. (q, not qw: qw builds a list
# of whitespace-separated words and would truncate a pattern with a space.)
my %pats;       # label => pattern; capture group 1 is the text reported
my %notpats;    # label => pattern; matches whose capture matches are skipped

$pats{"http"}            = q#http://(.*?)[\"\s\']#;
$pats{"relative url"}    = q#href=\s*[\"\'](.*?)[\"\']#;
$notpats{"relative url"} = q#http://|mailto:#;
$pats{"email address"}   = q#([\w\.]+\@[\w\.]+)#;

# your task: comment out the above and make the following functional:

# find url of frames (check at www.ou.edu):
#$pats{'frame'}=

# find "Bobby" noncompliant images, i.e. images without an alt attribute
# check at:
#   http://soonersports.ocsn.com
#   http://it.metr.ou.edu/rgraphics/radar
#   http://weather.ou.edu, etc.
#$pats{'bobby violation'}=
#$notpats{'bobby violation'}=

# find downloadable pdf documents. check at:
#   http://www.ou.edu/provost/pronew/
#   http://weather.ou.edu/grad.htm
#$pats{'pdf'}=

for my $key (sort keys %pats) {
    my $pat    = $pats{$key};
    my $notpat = $notpats{$key};

    print "\n$key: looking for $pat ";
    if ($notpat) { print "without $notpat :\n" }
    else         { print ":\n" }

    # Compile each pattern once per label. Delimiters are located before
    # interpolation, so a "/" or "#" inside $pat cannot end the regex early.
    # /s lets "." match newlines because $content is the whole document.
    my $pat_re    = qr{$pat}s;
    my $notpat_re = $notpat ? qr{$notpat}s : undef;

    my $count = 0;
    while ($content =~ m{$pat_re}g) {

        # Save the capture now: the exclusion match below would clobber $1.
        my $thematch = $1;
        next if $notpat_re && $thematch =~ $notpat_re;
        $count++;
        print " $count: $thematch\n";
    }
}