brennen
/
photoFetch

=pod
=head1 CONTEXT
This little gem came to me by way of a craiglist job. I wound up making about ahundred bucks for something like 50 hours of work on this project before Icut the guy off. At which point he threatened legal action and promised meI'd never work in Colorado again.
This sort of thing is why I don't take random craiglist jobs any more. It isalso why people are scared of old Perl.
-- BPB
=cut

## $SubjectIndex ='http://lcweb2.loc.gov/pp/ahiihtml/ahiisubjindex1.html';
$SubjectIndex = 'http://lcweb2.loc.gov/pp/pgzhtml/pgzsubjindex1.html';
## set this line to the archives you want to download.

my $curlStatements = "photoCurlStatements.txt";my $problemPageList = "photoProblemPages.txt";my $imagesDir = 'Photos/';  #name it whatever you like, just make sure my $outputFile = "photoData.csv";  #name it whatever you like my $progressFile = "photoProgress.txt"; #this tracks how far the script has gone (very coursely)
my $baseURL = 'http://lcweb2.loc.gov';   # This really shouldn't be changed, script is not very re-usablemy $lastPlacePage = 28; #the highest numbered place page
################################################## Programmers guide:# This is a non-generalised script for downloading# map files and information that can be found at# http://lcweb2.loc.gov/ammem/gmdhtml/gmdgeogindex1.html
# Major Subsections and functions# getPlacePage(placePageUrl)#   This function fetches the Place Page list and then#   goes through it one item at a time passing the urls#   to detectPageType
# detectPageType(unknownPageUrl)#   Takes a page URL, and figures out if it is a gallery#   page, or a data page. If it is a gallery page it also#   attempts to also find if there are more in the series.#   it then runs getGalPage for each one in the series, or if#   it is a data page, it forks of a getDataPage for it.# --------------------
# getGalPage(targetPageUrl)#   This function takes in a gallery page and forks off#   a new Process for each link on it. The process then#   runs getDataPage
# getDataPage(dataPageUrl)#   This script attempts to parse the page it has reached#   clean up the data, and then put it into the CSV file#   then it takes the image link it has found and passes#   it to getImageFromImagePage
# getImageFromImagePage(imagePageUrl)#   This visits the Image Page Url, and then does the#   parsing nessisary to actually find the image files#   Url. It then uses exec to launch a copy of curl to#   Download the the (often large) file. Because it#   uses exec the process dies here. #################################################

#$targetPage = 'http://lcweb2.loc.gov/cgi-bin/query/S?ammem/gmd:@FILREQ(@field(SUBJ+@od1(Bird%27s-eye+view+prints--1860-1870+))+@FIELD(COLLID+citymap))';
#$targetPage = 'http://lcweb2.loc.gov/cgi-bin/query/d?gmd:20:./temp/~ammem_1fen:';
#getGalPage($targetPage);
#detectPageType($mysPage);
# if (open PROGRESSFILE, "<$progressFile"){#     my @file = <PROGRESSFILE>;#     close PROGRESSFILE;#     $start = chomp($file[0]);#     print "Progress File says start at $start \n";# } else {#     print "No Progress File found so starting at the begining. \n";#     my $start = 1;# }
#$place = 'http://lcweb2.loc.gov/pp/ahiihtml/ahiisubjindex1.html';#getPlacePage($place);
$photo = 'http://lcweb2.loc.gov/cgi-bin/query/I?ils:1:./temp/~pp_GhEo::displayType=1:m856sd=cph:m856sf=3b28121:@@@';#getImageFromImagePage($photo);
$dataPage = 'http://lcweb2.loc.gov/cgi-bin/query/S?pp/ils:@FILREQ(@field(SUBJ+@od1(Galatasaray+Mekteb-i+Sultanisi--Buildings--1880-1900+))+@FIELD(COLLID+ahii))';#getDataPage($dataPage);
$galleryPage = 'http://lcweb2.loc.gov/cgi-bin/query/S?pp/ils:@FILREQ(@field(SUBJ+@od1(Garden+rooms--Turkey--Istanbul--1880-1900+))+@FIELD(COLLID+ahii))';#detectPageType($galleryPage);#detectPageType($dataPage);
$subjectsPage ='http://lcweb2.loc.gov/pp/ahiihtml/ahiiSubjects04.html';#getPlacePage($subjectsPage);

getSubjectPage($SubjectIndex);
#getCollections();

#no longer being usedsub getCollections{    print "getCollection \n";    my $collectionPageUrl = 'http://lcweb2.loc.gov/pp/pphome.html';    $collectionPage = `curl --retry 30  -s \'$collectionPageUrl\'`;    $_ = $collectionPage;    @collectionLinks = /\<A HREF=\".*?\"/gs;
    foreach(@collectionLinks) {	$subjectPageUrl = $_;	$subjectPageUrl =~ s/\<A HREF=\"(.*?)\"/\1/;	#print $baseURL . "/pp/" . $subjectPageUrl. " ****\n";
	$subSubUrl = $baseURL . "/pp/" . $subjectPageUrl. "\n";	$subjectSubpage = `curl --retry 30 -s  \'$subSubUrl\'`;		if($subjectSubpage =~ /Subject.and.format.headings/s){    	  $subjectSubpage =~ s/.*Browse.*?\<A HREF=\"(.*?)\"\>.*/\1/s ;	  if (length($subjectSubpage) > 150){	    #print length($subjectSubpage) . " wrong kind of page2\n";	  }else {	      $getpage = $baseURL . "/pp/" . $subjectSubpage;	      print "!!!!!!!!!  " .$getpage . " !!! \n\n\n";	 	      getSubjectPage($getpage);	  }	      } else {     	  #print length($subjectSubpage) . " wrong kind of page\n";      }    }
}


sub getSubjectPage{    print "getSubjectPage \n";    my $localSubjectPageUrl = shift;    $localSubjectPage = `curl --retry 30  -s \'$localSubjectPageUrl\'`;    $_ = $localSubjectPage;    @subjectLinks = /From.*?\<a href=\".*?\"\>/gs;
    $base = $localSubjectPageUrl;    $base =~ s/(.*)\/.*?$/\1/s ;
    foreach(@subjectLinks) {	$dataPageUrl = $_;	$dataPageUrl =~ s/From.*\<a href=\"(.*?)\"\>/\1/;	print $base ."/" .$dataPageUrl. "000 \n";	getPlacePage( $base ."/" .$dataPageUrl);    }}

sub getPlacePage{    print "getPlacePage \n";    my $localPlacePageUrl = shift;    $localPlacePage = `curl --retry 30  -s \'$localPlacePageUrl\'`;    $_ = $localPlacePage;    @placeLinks = /\<A HREF=\".cgi-bin.query.*?\"\>/gs;
    foreach(@placeLinks) {	$dataPageUrl = $_;	$dataPageUrl =~ s/\<A HREF=\"(.*?)\"\>/\1/;	print $dataPageUrl;	detectPageType($baseURL . $dataPageUrl);    }
}
sub detectPageType{    print "detectPageType \n";  my $mysteryPageUrl = shift; 
  #print "FINDING page type for $mysteryPageUrl\n";  $mysteryPage = `curl --retry 30  -s \'$mysteryPageUrl\'`;
  if ($mysteryPage =~ /\<title\>Search Results\<.title\>/s){      getGalPage($mysteryPageUrl);  } elsif ($mysteryPage =~ /\<title\>\d+?\<.title\>/s){      getDataPage($mysteryPageUrl);  } else {      print "UNKNOWN page type for $mysteryPageUrl \n";  }
}
sub getGalPage{  my $targetPageUrl = shift; 
  print "getGalPage $targetPageUrl \n";  $galleryPage = `curl --retry 30 -s \'$targetPageUrl\'`;    $_ = $galleryPage;  @galLinks = /\<A HREF=\".cgi-bin.query.*?\"\>/gs;
  foreach(@galLinks) {    $dataPageUrl = $_;    $dataPageUrl =~ s/\<A HREF=\"(.*?)\"\>/\1/;    #print( "glinks:  ". $baseURL . $dataPageUrl ."\n");    getDataPage($baseURL . $dataPageUrl);  }
  $nextPage = $galleryPage;  $nextPage =~ s/.*a href\=\"(.*?)\"\>NEXT PAGE.*/\1/s;  if (length($nextPage) < 90 and length($nextPage) > 1){      getGalPage($baseURL . $nextPage);  }

}
sub getDataPage{    print "getDataPage \n";  my $subTarget = shift;   $subTarget =~ s/\'/%27/g ;  print "--------runing curl --retry 30 -s \'$subTarget\' \n\n";  $dataPage = `curl --retry 30  -s \'$subTarget\'`;    #make dups to run regex on  $imgLink = $dataPage;  $title = $dataPage;  $created = $dataPage;  $notes = $dataPage;  $subjects = $dataPage;  $names = $dataPage;  $medium = $dataPage;  $callNumber = $dataPage;  $repository = $dataPage;  $digitalId = $dataPage;    #find the correct section using regex  $imgLink =~ s/.*\<A HREF=\"(.*?)\"\>\W?\<IMG SRC.*?\>.*/\1/s;
  #print $imgLink ."\n";
  $title =~ s/.*TITLE:(.*?)CALL.*/\1/s;  $created =~ s/.*CREATED.PUBLISHED:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;  $notes =~ s/.*NOTES:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;  $subjects =~ s/.*SUBJECTS:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;  $names =~ s/.*RELATED.NAMES:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;  $medium =~ s/.*MEDIUM:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;  $callNumber =~ s/.*CALL.NUMBER:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;    $repository =~ s/.*REPOSITORY:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;    $digitalId =~ s/.*DIGITAL.ID:(.*?)\<P\>.*/\1/s;    #strip html  $title =~ s/\<[b-zB-Z\/]{1,5}\>|\&nbsp\;|\n|,|\'//g;  $created =~ s/\<.{1,5}\>|\n|,|\'|\"//g;  $notes =~ s/\<.{1,5}\>|\n|,|\'|\"//g;  $subjects =~ s/\<[b-zB-Z\/]{1,5}\>|\n|,|\'//g;  $names =~ s/\<.{1,5}\>|\n|,|\'//g;  $medium =~ s/\<.{1,5}\>|\n|,|\'//g;  $callNumber =~ s/\<.{1,5}\>|\n|,|\'//g;  $repository =~ s/\<.{1,5}\>|\n|,|\'//g;  $digitalId =~ s/\<.{1,5}\>|\n|,|\'//g;
  #make links stop being relative  $title =~  s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;  $created =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;  $notes =~  s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;  $subjects =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;  $names =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;  $medium =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;  $callNumber =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;  $repository =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;  $digitalId =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
    #check to make sure that we don't paste a whole page into a field  if (length($imgLink) > {500}){$imgLink = ''};  if (length($title) > 1500){$title = ''};  if (length($created) > 1500){$created = ''};  if (length($notes ) > 1500){$notes = ''};  if (length($subjects ) >1500){$subjects = ''};  if (length($names ) > 1500){$names = ''};  if (length($medium ) >1500){$medium = ''};  if (length($callNumber ) > 1500){$callNumber = ''};  if (length($repository ) > 1500){$repository = ''};  if (length($digitalId ) > 1500){$digitalId = ''};    #specific request for the first date to be appended to file names  $firstDate = $created;  $firstDate =~ s/.*?(\d\d\d\d).*/\1/ ;  if ($firstDate =~ /.{5}/){$firstDate = ''};
  #$uniqueMarker = $$; #use the pid if we are forking  $uniqueMarker = int(rand(9999)); #use the pid if we are forking
  #set the name the image will be saved under  $imgName = $title;  $imgName =~ s/\<.*?\>|\W//g;  $imgName =~ s/(.{0,23}).*/\1/;  $imgName = $uniqueMarker. $imgName . "_" . $firstDate;

  ##debug only#     print  "\n title". $title;#     print  "\n created" . $created;#     print  "\n notes" . $notes;#     print  "\n subjects" . $subjects;#     print  "\n names" . $names;#     print  "\n medium" . $medium;#     print  "\n callNumber" . $callNumber;#     print  "\n repository" . $repository;#     print  "\n digitalId" . $digitalId;#     print  "\n writable img name" . $imgName.$imgType;#   print  "\n";


  if (!$imgLink || length($imgLink) > 100 ){    print "No downloadable image\n";    #    die( "$$ Appears to be a Gallery Page\n");  } else {    #fetch the image    getImageFromImagePage($baseURL . $imgLink);  }} # close sub getDataPage
sub getImageFromImagePage{    print "getImageFromImagePage \n";    my $imgPageUrl = shift;    $imgPageUrl =~ s/\'/%27/g ;    #print $imgPageUrl . "\n";    $imgPage = `curl --retry 30  -s \'$imgPageUrl\'`;    $imgPage =~ s/.*\<a href=\"(.*?)\"\>Retrieve.*/\1/s ;     if (length($imgPage) > 200 ){	print "can't find image Link for $imgName \n";    }    $imgType = $imgPage;    $imgType =~ s/.*(\....)$/\1/;  # Lock and write out data to our output file;    $lockfile="lock_the_file.loc";    while (-e $lockfile) {      print "sleep for $$ \n";      sleep 2;    }    open (LOCK,">$lockfile") || die ("Cannot open lock file!\n");    close (LOCK);      open(DAT,">>$outputFile") || die("Cannot Open $outputFile ");    print DAT "\n". $title;    print DAT ',' . $created;    print DAT ',' . $notes;    print DAT ',' . $subjects;    print DAT ',' . $names;    print DAT ',' . $medium;    print DAT ',' . $callNumber;    print DAT ',' . $repository;    print DAT ',' . $digitalId;    print DAT ',' . $imgName.$imgType;    close(DAT);      my $myCurl = "curl --retry 30 " . $imgPage." -o ".$imagesDir.$imgName.$imgType . "\n";
    open(CURL,">>$curlStatements") || die("Cannot Open $outputFile ");    print CURL $myCurl;    unlink($lockfile);
    print "\nRUNNING " . $myCurl ;    system($myCurl);} # close sub getImageFromImagePage