=pod

=head1 CONTEXT

This little gem came to me by way of a craiglist job. I wound up making
about a hundred bucks for something like 50 hours of work on this project
before I cut the guy off. At which point he threatened legal action and
promised me I'd never work in Colorado again. This sort of thing is why I
don't take random craiglist jobs any more. It is also why people are
scared of old Perl. -- BPB

=cut

# --------------------------------------------------------------------------
# Configuration
# --------------------------------------------------------------------------

## $SubjectIndex ='http://lcweb2.loc.gov/pp/ahiihtml/ahiisubjindex1.html';
$SubjectIndex = 'http://lcweb2.loc.gov/pp/pgzhtml/pgzsubjindex1.html'; ## set this line to the archives you want to download.

my $curlStatements = "photoCurlStatements.txt"; # log of every curl command built for an image download
my $problemPageList = "photoProblemPages.txt"; # NOTE(review): declared but never used in the visible code
my $imagesDir = 'Photos/'; #name it whatever you like, just make sure (comment truncated in original - presumably "...it exists and ends in a slash"; TODO confirm)
my $outputFile = "photoData.csv"; #name it whatever you like
my $progressFile = "photoProgress.txt"; #this tracks how far the script has gone (very coarsely)
my $baseURL = 'http://lcweb2.loc.gov'; # This really shouldn't be changed, script is not very re-usable
my $lastPlacePage = 28; #the highest numbered place page (NOTE(review): never read in the visible code)

#################################################
# Programmers guide:
# This is a non-generalised script for downloading
# map files and information that can be found at
# http://lcweb2.loc.gov/ammem/gmdhtml/gmdgeogindex1.html
#
# Major Subsections and functions
#
# getPlacePage(placePageUrl)
#   This function fetches the Place Page list and then
#   goes through it one item at a time passing the urls
#   to detectPageType
#
# detectPageType(unknownPageUrl)
#   Takes a page URL, and figures out if it is a gallery
#   page, or a data page. If it is a gallery page it also
#   attempts to find if there are more in the series.
#   It then runs getGalPage for each one in the series, or if
#   it is a data page, it forks off a getDataPage for it.
#   --------------------
#
# getGalPage(targetPageUrl)
#   This function takes in a gallery page and forks off
#   a new process for each link on it. The process then
#   runs getDataPage
#
# getDataPage(dataPageUrl)
#   This sub attempts to parse the page it has reached,
#   clean up the data, and then put it into the CSV file;
#   then it takes the image link it has found and passes
#   it to getImageFromImagePage
#
# getImageFromImagePage(imagePageUrl)
#   This visits the Image Page Url, and then does the
#   parsing necessary to actually find the image file's
#   Url. It then uses exec to launch a copy of curl to
#   download the (often large) file. Because it
#   uses exec the process dies here.
#   (NOTE(review): the code as visible actually uses system(), not exec,
#   so the process survives - confirm which behaviour is intended.)
#################################################

# Leftover ad-hoc test calls, kept commented out:
#$targetPage = 'http://lcweb2.loc.gov/cgi-bin/query/S?ammem/gmd:@FILREQ(@field(SUBJ+@od1(Bird%27s-eye+view+prints--1860-1870+))+@FIELD(COLLID+citymap))';
#$targetPage = 'http://lcweb2.loc.gov/cgi-bin/query/d?gmd:20:./temp/~ammem_1fen:';
#getGalPage($targetPage);
#detectPageType($mysPage);

# Commented-out progress-file resume logic. NOTE(review): the readline that
# filled @file - presumably <PROGRESSFILE> - was eaten by the HTML mangling
# this file suffered, and "$start = chomp(...)" would store chomp's return
# count rather than the line itself; restore from an uncorrupted copy before
# re-enabling.
# if (open PROGRESSFILE, "<$progressFile"){
#     my @file = ;
#     close PROGRESSFILE;
#     $start = chomp($file[0]);
#     print "Progress File says start at $start \n";
# } else {
#     print "No Progress File found so starting at the begining.
# --------------------------------------------------------------------------
# NOTE(review): this file appears to have been mangled in transit by an
# HTML-aware paste: literal <tag>-shaped sequences inside regexes were eaten,
# leaving several link-extraction patterns truncated (some unterminated).
# Those remnants are preserved verbatim below and flagged individually; they
# must be restored from an uncorrupted copy of the script.
# --------------------------------------------------------------------------
# Residue of the commented-out progress-file block above:
# \n";
#     my $start = 1;
# }

#$place = 'http://lcweb2.loc.gov/pp/ahiihtml/ahiisubjindex1.html';
#getPlacePage($place);

# Sample URLs kept around for ad-hoc testing of the individual subs below.
$photo = 'http://lcweb2.loc.gov/cgi-bin/query/I?ils:1:./temp/~pp_GhEo::displayType=1:m856sd=cph:m856sf=3b28121:@@@';
#getImageFromImagePage($photo);
$dataPage = 'http://lcweb2.loc.gov/cgi-bin/query/S?pp/ils:@FILREQ(@field(SUBJ+@od1(Galatasaray+Mekteb-i+Sultanisi--Buildings--1880-1900+))+@FIELD(COLLID+ahii))';
#getDataPage($dataPage);
$galleryPage = 'http://lcweb2.loc.gov/cgi-bin/query/S?pp/ils:@FILREQ(@field(SUBJ+@od1(Garden+rooms--Turkey--Istanbul--1880-1900+))+@FIELD(COLLID+ahii))';
#detectPageType($galleryPage);
#detectPageType($dataPage);
$subjectsPage ='http://lcweb2.loc.gov/pp/ahiihtml/ahiiSubjects04.html';
#getPlacePage($subjectsPage);

# Entry point: crawl everything reachable from the configured subject index.
getSubjectPage($SubjectIndex);

#getCollections(); #no longer being used

# FIX(review): getCollections() is dead code (its only call site, above, is
# commented out) and its body is corrupted - the link-extraction regex and an
# enclosing foreach/if were eaten by the HTML mangling, leaving an orphan
# "else" and unbalanced braces that break compilation.  It is commented out
# wholesale, remnants preserved verbatim, rather than guessed at; restore it
# from an uncorrupted copy if it is ever needed again.
#sub getCollections{
#    print "getCollection \n";
#    my $collectionPageUrl = 'http://lcweb2.loc.gov/pp/pphome.html';
#    $collectionPage = `curl --retry 30 -s \'$collectionPageUrl\'`;
#    $_ = $collectionPage;
#    @collectionLinks = /\.*/\1/s ;
#    if (length($subjectSubpage) > 150){
#        #print length($subjectSubpage) . " wrong kind of page2\n";
#    }else {
#        $getpage = $baseURL . "/pp/" . $subjectSubpage;
#        print "!!!!!!!!! " .$getpage . " !!! \n\n\n";
#        getSubjectPage($getpage);
#    }
#    } else {
#        #print length($subjectSubpage) . " wrong kind of page\n";
#    }
#    }
#}

# getSubjectPage(subjectIndexUrl)
# Fetches a subject-index page (via shelled-out curl), extracts the
# per-subject links, and hands each one - resolved against the index page's
# directory - to getPlacePage().
sub getSubjectPage{
    print "getSubjectPage \n";
    my $localSubjectPageUrl = shift;
    # Single quotes around the URL keep shell metacharacters inert.
    $localSubjectPage = `curl --retry 30 -s \'$localSubjectPageUrl\'`;
    $_ = $localSubjectPage;
    # NOTE(review): pattern truncated by the HTML mangling (the literal
    # <a href=...> core is gone) and is now unterminated; remnant kept
    # verbatim - restore from an uncorrupted copy.
    @subjectLinks = /From.*?\/gs;
    # Base directory of the index page, for resolving relative links.
    $base = $localSubjectPageUrl;
    $base =~ s/(.*)\/.*?$/\1/s ;
    foreach(@subjectLinks) {
        $dataPageUrl = $_;
        # NOTE(review): truncated substitution - remnant kept verbatim.
        $dataPageUrl =~ s/From.*\/\1/;
        print $base ."/" .$dataPageUrl. "000 \n";
        getPlacePage( $base ."/" .$dataPageUrl);
    }
}

# getPlacePage(placePageUrl)
# Fetches a "place page" listing, pulls out each item link, and passes the
# absolute URL to detectPageType() for classification and processing.
sub getPlacePage{
    print "getPlacePage \n";
    my $localPlacePageUrl = shift;
    $localPlacePage = `curl --retry 30 -s \'$localPlacePageUrl\'`;
    $_ = $localPlacePage;
    # NOTE(review): truncated, unterminated pattern (see file note) -
    # remnant kept verbatim.
    @placeLinks = /\/gs;
    foreach(@placeLinks) {
        $dataPageUrl = $_;
        # NOTE(review): truncated substitution - remnant kept verbatim.
        $dataPageUrl =~ s/\/\1/;
        print $dataPageUrl;
        detectPageType($baseURL . $dataPageUrl);
    }
}

# detectPageType(unknownPageUrl)
# Fetches a page and inspects its title to decide whether it is a gallery
# ("Search Results") page or a single-item data page, then dispatches to
# getGalPage() or getDataPage() accordingly.
sub detectPageType{
    print "detectPageType \n";
    my $mysteryPageUrl = shift;
    #print "FINDING page type for $mysteryPageUrl\n";
    $mysteryPage = `curl --retry 30 -s \'$mysteryPageUrl\'`;
    # NOTE(review): both title tests lost their opening <title> literal to
    # the HTML mangling; remnants kept verbatim.
    if ($mysteryPage =~ /\Search Results\<.title\>/s){
        getGalPage($mysteryPageUrl);
    } elsif ($mysteryPage =~ /\\d+?\<.title\>/s){
        getDataPage($mysteryPageUrl);
    } else {
        print "UNKNOWN page type for $mysteryPageUrl \n";
    }
}

# getGalPage(targetPageUrl)
# Processes one gallery (search-results) page: runs getDataPage() for every
# item link on it, then follows the "NEXT PAGE" link recursively until the
# series runs out.
sub getGalPage{
    my $targetPageUrl = shift;
    print "getGalPage $targetPageUrl \n";
    $galleryPage = `curl --retry 30 -s \'$targetPageUrl\'`;
    $_ = $galleryPage;
    # NOTE(review): truncated, unterminated pattern - remnant kept verbatim.
    @galLinks = /\/gs;
    foreach(@galLinks) {
        $dataPageUrl = $_;
        # NOTE(review): truncated substitution - remnant kept verbatim.
        $dataPageUrl =~ s/\/\1/;
        #print( "glinks: ". $baseURL . $dataPageUrl ."\n");
        getDataPage($baseURL . $dataPageUrl);
    }
    # Pagination: if the substitution matched, $nextPage holds just the NEXT
    # PAGE href; if it failed, it still holds the whole page, which the
    # length window below filters out.
    $nextPage = $galleryPage;
    $nextPage =~ s/.*a href\=\"(.*?)\"\>NEXT PAGE.*/\1/s;
    if (length($nextPage) < 90 and length($nextPage) > 1){
        getGalPage($baseURL . $nextPage);
    }
}

# getDataPage(dataPageUrl)
# Fetches a single item's data page, regex-scrapes each catalogue field out
# of the HTML, cleans the values up for CSV output, and - if a plausible
# image link was found - hands it to getImageFromImagePage().  The scraped
# fields are deliberately left in file-wide globals ($title, $created, ...)
# because getImageFromImagePage() writes them to the CSV later.
sub getDataPage{
    print "getDataPage \n";
    my $subTarget = shift;
    # Encode single quotes so the shell-quoted curl argument cannot break out.
    $subTarget =~ s/\'/%27/g ;
    print "--------runing curl --retry 30 -s \'$subTarget\' \n\n";
    $dataPage = `curl --retry 30 -s \'$subTarget\'`;

    #make dups to run regex on
    $imgLink = $dataPage;
    $title = $dataPage;
    $created = $dataPage;
    $notes = $dataPage;
    $subjects = $dataPage;
    $names = $dataPage;
    $medium = $dataPage;
    $callNumber = $dataPage;
    $repository = $dataPage;
    $digitalId = $dataPage;

    #find the correct section using regex
    # (a failed match leaves the whole page in the variable - the length
    # guards further down blank such values)
    # NOTE(review): the $imgLink pattern lost its <a href=...> core to the
    # HTML mangling; remnant kept verbatim.
    $imgLink =~ s/.*\\W?\.*/\1/s;
    #print $imgLink ."\n";
    $title =~ s/.*TITLE:(.*?)CALL.*/\1/s;
    $created =~ s/.*CREATED.PUBLISHED:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
    $notes =~ s/.*NOTES:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
    $subjects =~ s/.*SUBJECTS:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
    $names =~ s/.*RELATED.NAMES:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
    $medium =~ s/.*MEDIUM:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
    $callNumber =~ s/.*CALL.NUMBER:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
    $repository =~ s/.*REPOSITORY:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
    # NOTE(review): tail of this pattern looks truncated too - verify.
    $digitalId =~ s/.*DIGITAL.ID:(.*?)\.*/\1/s;

    #strip html, newlines, commas and quotes (commas/quotes would break the
    # hand-rolled CSV)
    # NOTE(review): the "\ \;" alternation below was presumably "\&nbsp\;"
    # before the mangling - confirm against the original.
    $title =~ s/\<[b-zB-Z\/]{1,5}\>|\ \;|\n|,|\'//g;
    $created =~ s/\<.{1,5}\>|\n|,|\'|\"//g;
    $notes =~ s/\<.{1,5}\>|\n|,|\'|\"//g;
    $subjects =~ s/\<[b-zB-Z\/]{1,5}\>|\n|,|\'//g;
    $names =~ s/\<.{1,5}\>|\n|,|\'//g;
    $medium =~ s/\<.{1,5}\>|\n|,|\'//g;
    $callNumber =~ s/\<.{1,5}\>|\n|,|\'//g;
    $repository =~ s/\<.{1,5}\>|\n|,|\'//g;
    $digitalId =~ s/\<.{1,5}\>|\n|,|\'//g;

    #make links stop being relative
    $title =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
    $created =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
    $notes =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
    $subjects =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
    $names =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
    $medium =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
    $callNumber =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
    $repository =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
    $digitalId =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;

    #check to make sure that we don't paste a whole page into a field
    # FIX(review): was "length($imgLink) > {500}" - comparing against an
    # anonymous hash reference, whose numified address meant the guard could
    # never fire.  Restored to a plain numeric threshold (the neighbours use
    # 1500; confirm which value the original intended).
    if (length($imgLink) > 500){$imgLink = ''};
    if (length($title) > 1500){$title = ''};
    if (length($created) > 1500){$created = ''};
    if (length($notes ) > 1500){$notes = ''};
    if (length($subjects ) >1500){$subjects = ''};
    if (length($names ) > 1500){$names = ''};
    if (length($medium ) >1500){$medium = ''};
    if (length($callNumber ) > 1500){$callNumber = ''};
    if (length($repository ) > 1500){$repository = ''};
    if (length($digitalId ) > 1500){$digitalId = ''};

    #specific request for the first date to be appended to file names
    $firstDate = $created;
    $firstDate =~ s/.*?(\d\d\d\d).*/\1/ ;
    # Anything longer than four characters means no year was found.
    if ($firstDate =~ /.{5}/){$firstDate = ''};

    #$uniqueMarker = $$; #use the pid if we are forking
    $uniqueMarker = int(rand(9999)); #use the pid if we are forking

    #set the name the image will be saved under:
    # random marker + first 23 word-characters of the title + first date
    $imgName = $title;
    $imgName =~ s/\<.*?\>|\W//g;
    $imgName =~ s/(.{0,23}).*/\1/;
    $imgName = $uniqueMarker. $imgName . "_" . $firstDate;

    ##debug only
    # print "\n title". $title;
    # print "\n created" . $created;
    # print "\n notes" . $notes;
    # print "\n subjects" . $subjects;
    # print "\n names" . $names;
    # print "\n medium" . $medium;
    # print "\n callNumber" . $callNumber;
    # print "\n repository" . $repository;
    # print "\n digitalId" . $digitalId;
    # print "\n writable img name" . $imgName.$imgType;
    # print "\n";

    if (!$imgLink || length($imgLink) > 100 ){
        print "No downloadable image\n";
        # die( "$$ Appears to be a Gallery Page\n");
    } else {
        #fetch the image
        getImageFromImagePage($baseURL . $imgLink);
    }
} # close sub getDataPage

# getImageFromImagePage(imagePageUrl)
# Fetches the intermediate image page, digs the real image URL out of it,
# appends one CSV row of the scraped metadata (under a crude lock file so
# parallel workers don't interleave rows), logs the curl command to
# $curlStatements, then runs curl to download the (often large) image.
sub getImageFromImagePage{
    print "getImageFromImagePage \n";
    my $imgPageUrl = shift;
    $imgPageUrl =~ s/\'/%27/g ;
    #print $imgPageUrl . "\n";
    $imgPage = `curl --retry 30 -s \'$imgPageUrl\'`;
    # NOTE(review): pattern corrupted by the HTML mangling (its capture
    # group is gone, so \1 is empty); remnant kept verbatim - restore from
    # an uncorrupted copy.
    $imgPage =~ s/.*\Retrieve.*/\1/s ;
    # A long result means the substitution did not match and $imgPage still
    # holds the whole page.  NOTE(review): execution continues regardless -
    # presumably $problemPageList was meant to record these; confirm.
    if (length($imgPage) > 200 ){
        print "can't find image Link for $imgName \n";
    }
    # File extension = last four characters of the image URL (".jpg" etc).
    $imgType = $imgPage;
    $imgType =~ s/.*(\....)$/\1/;

    # Lock and write out data to our output file.
    # NOTE(review): this check-then-create lock is not race-free - two
    # sleeping processes can both see the file vanish and both "acquire" it.
    $lockfile="lock_the_file.loc";
    while (-e $lockfile) {
        print "sleep for $$ \n";
        sleep 2;
    }
    open (LOCK,">$lockfile") || die ("Cannot open lock file!\n");
    close (LOCK);
    open(DAT,">>$outputFile") || die("Cannot Open $outputFile ");
    print DAT "\n". $title;
    print DAT ',' . $created;
    print DAT ',' . $notes;
    print DAT ',' . $subjects;
    print DAT ',' . $names;
    print DAT ',' . $medium;
    print DAT ',' . $callNumber;
    print DAT ',' . $repository;
    print DAT ',' . $digitalId;
    print DAT ',' . $imgName.$imgType;
    close(DAT);
    my $myCurl = "curl --retry 30 " . $imgPage." -o ".$imagesDir.$imgName.$imgType . "\n";
    # FIX(review): the die message wrongly named $outputFile, and the CURL
    # handle was never closed (risking one lost, buffered line per process).
    open(CURL,">>$curlStatements") || die("Cannot Open $curlStatements ");
    print CURL $myCurl;
    close(CURL);
    unlink($lockfile);
    print "\nRUNNING " . $myCurl ;
    system($myCurl);
} # close sub getImageFromImagePage