Just awful.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 

379 lines
12 KiB

=pod

=head1 CONTEXT

This little gem came to me by way of a craigslist job. I wound up making about a
hundred bucks for something like 50 hours of work on this project before I
cut the guy off. At which point he threatened legal action and promised me
I'd never work in Colorado again.

This sort of thing is why I don't take random craigslist jobs any more. It is
also why people are scared of old Perl.

-- BPB

=cut
## Alternate subject index (an earlier collection), kept for reference:
## $SubjectIndex ='http://lcweb2.loc.gov/pp/ahiihtml/ahiisubjindex1.html';
# Entry-point URL the script walks.  Deliberately a package global (no my);
# it is passed to getSubjectPage() at the bottom of the file.
$SubjectIndex = 'http://lcweb2.loc.gov/pp/pgzhtml/pgzsubjindex1.html';
## set this line to the archives you want to download.
my $curlStatements = "photoCurlStatements.txt"; # log of every curl download command issued
my $problemPageList = "photoProblemPages.txt"; # NOTE(review): declared but never used in this file
my $imagesDir = 'Photos/'; #name it whatever you like, just make sure it ends in '/' (it is concatenated directly with the file name)
my $outputFile = "photoData.csv"; #name it whatever you like
my $progressFile = "photoProgress.txt"; #this tracks how far the script has gone (very coarsely); the resume code that reads it is currently commented out below
my $baseURL = 'http://lcweb2.loc.gov'; # This really shouldn't be changed, script is not very re-usable
my $lastPlacePage = 28; #the highest numbered place page (NOTE(review): not referenced anywhere in this file)
#################################################
# Programmers guide:
# This is a non-generalised script for downloading
# map files and information that can be found at
# http://lcweb2.loc.gov/ammem/gmdhtml/gmdgeogindex1.html
# Major Subsections and functions
# getPlacePage(placePageUrl)
# This function fetches the Place Page list and then
# goes through it one item at a time passing the urls
# to detectPageType
# detectPageType(unknownPageUrl)
# Takes a page URL, and figures out if it is a gallery
# page, or a data page. If it is a gallery page it also
# attempts to find whether there are more pages in the series.
# It then runs getGalPage for each one in the series, or if
# it is a data page, it forks off a getDataPage for it.
# --------------------
# getGalPage(targetPageUrl)
# This function takes in a gallery page and forks off
# a new Process for each link on it. The process then
# runs getDataPage
# getDataPage(dataPageUrl)
# This function attempts to parse the page it has reached,
# clean up the data, and then put it into the CSV file;
# then it takes the image link it has found and passes
# it to getImageFromImagePage
# getImageFromImagePage(imagePageUrl)
# This visits the Image Page URL, and then does the
# parsing necessary to actually find the image file's
# URL. It then uses exec to launch a copy of curl to
# download the (often large) file. Because it
# uses exec the process dies here.
#################################################
## ----------------------------------------------------------------------
## Leftover manual test harness: each commented pair below exercised one
## sub against a known URL during development.  The only live call in this
## section is getSubjectPage($SubjectIndex) at the end.
## ----------------------------------------------------------------------
#$targetPage = 'http://lcweb2.loc.gov/cgi-bin/query/S?ammem/gmd:@FILREQ(@field(SUBJ+@od1(Bird%27s-eye+view+prints--1860-1870+))+@FIELD(COLLID+citymap))';
#$targetPage = 'http://lcweb2.loc.gov/cgi-bin/query/d?gmd:20:./temp/~ammem_1fen:';
#getGalPage($targetPage);
#detectPageType($mysPage);
## Disabled resume-from-progress-file logic:
# if (open PROGRESSFILE, "<$progressFile"){
# my @file = <PROGRESSFILE>;
# close PROGRESSFILE;
# $start = chomp($file[0]);
# print "Progress File says start at $start \n";
# } else {
# print "No Progress File found so starting at the beginning. \n";
# my $start = 1;
# }
#$place = 'http://lcweb2.loc.gov/pp/ahiihtml/ahiisubjindex1.html';
#getPlacePage($place);
# NOTE(review): the four assignments below are live code but their values
# are unused -- every call that consumed them is commented out.
$photo = 'http://lcweb2.loc.gov/cgi-bin/query/I?ils:1:./temp/~pp_GhEo::displayType=1:m856sd=cph:m856sf=3b28121:@@@';
#getImageFromImagePage($photo);
$dataPage = 'http://lcweb2.loc.gov/cgi-bin/query/S?pp/ils:@FILREQ(@field(SUBJ+@od1(Galatasaray+Mekteb-i+Sultanisi--Buildings--1880-1900+))+@FIELD(COLLID+ahii))';
#getDataPage($dataPage);
$galleryPage = 'http://lcweb2.loc.gov/cgi-bin/query/S?pp/ils:@FILREQ(@field(SUBJ+@od1(Garden+rooms--Turkey--Istanbul--1880-1900+))+@FIELD(COLLID+ahii))';
#detectPageType($galleryPage);
#detectPageType($dataPage);
$subjectsPage ='http://lcweb2.loc.gov/pp/ahiihtml/ahiiSubjects04.html';
#getPlacePage($subjectsPage);
# Entry point: walk the configured subject index.
getSubjectPage($SubjectIndex);
#getCollections();
#no longer being used
# getCollections()
#
# (Dead code -- the only caller is commented out.)  Walks the Prints &
# Photographs home page, follows every anchor, and for each linked page
# that mentions "Subject and format headings" extracts its Browse link and
# hands it to getSubjectPage().  All fetches shell out to curl.
sub getCollections{
    print "getCollection \n";
    my $collectionPageUrl = 'http://lcweb2.loc.gov/pp/pphome.html';
    $collectionPage = `curl --retry 30 -s \'$collectionPageUrl\'`;
    # Every anchor on the home page is a candidate collection link.
    my @anchors = ($collectionPage =~ /\<A HREF=\".*?\"/gs);
    foreach my $anchor (@anchors) {
        (my $href = $anchor) =~ s/\<A HREF=\"(.*?)\"/$1/;
        # NOTE(review): the trailing "\n" ends up inside the quoted curl
        # argument -- preserved as-is from the original.
        my $subUrl = $baseURL . "/pp/" . $href . "\n";
        $subjectSubpage = `curl --retry 30 -s \'$subUrl\'`;
        # Wrong kind of page entirely -- skip it.
        next unless $subjectSubpage =~ /Subject.and.format.headings/s;
        # Reduce the whole page to just the Browse link target.
        $subjectSubpage =~ s/.*Browse.*?\<A HREF=\"(.*?)\"\>.*/$1/s ;
        # If the substitution failed we still hold a whole page -- skip.
        next if length($subjectSubpage) > 150;
        my $browseUrl = $baseURL . "/pp/" . $subjectSubpage;
        print "!!!!!!!!! " . $browseUrl . " !!! \n\n\n";
        getSubjectPage($browseUrl);
    }
}
# getSubjectPage(subjectPageUrl)
#
# Fetch one subject-index page and chase every "From ... <a href=...>"
# link on it.  Each relative link is resolved against the index page's
# directory and handed to getPlacePage().
sub getSubjectPage{
    print "getSubjectPage \n";
    my $pageUrl = shift;
    $localSubjectPage = `curl --retry 30 -s \'$pageUrl\'`;
    # Directory portion of the index URL; relative links hang off it.
    my $dir = $pageUrl;
    $dir =~ s/(.*)\/.*?$/$1/s ;
    my @fromLinks = ($localSubjectPage =~ /From.*?\<a href=\".*?\"\>/gs);
    foreach my $chunk (@fromLinks) {
        # Strip the surrounding markup, keeping only the href value.
        (my $rel = $chunk) =~ s/From.*\<a href=\"(.*?)\"\>/$1/;
        print $dir ."/" .$rel. "000 \n";
        getPlacePage( $dir ."/" .$rel);
    }
}
# getPlacePage(placePageUrl)
#
# Download a place/subject listing page and run every cgi-bin query link
# found on it through detectPageType().
sub getPlacePage{
    print "getPlacePage \n";
    my $listUrl = shift;
    $localPlacePage = `curl --retry 30 -s \'$listUrl\'`;
    # Only anchors pointing into the query CGI are records/galleries.
    my @anchors = ($localPlacePage =~ /\<A HREF=\".cgi-bin.query.*?\"\>/gs);
    foreach my $anchor (@anchors) {
        (my $href = $anchor) =~ s/\<A HREF=\"(.*?)\"\>/$1/;
        print $href;
        detectPageType($baseURL . $href);
    }
}
# detectPageType(mysteryPageUrl)
#
# Fetch an unknown URL and dispatch on its <title>:
#   "Search Results"   -> gallery listing -> getGalPage()
#   all-digits title   -> single record   -> getDataPage()
# Anything else is reported and skipped.
sub detectPageType{
    print "detectPageType \n";
    my $unknownUrl = shift;
    $mysteryPage = `curl --retry 30 -s \'$unknownUrl\'`;
    if ($mysteryPage =~ /\<title\>Search Results\<.title\>/s) {
        getGalPage($unknownUrl);
        return;
    }
    if ($mysteryPage =~ /\<title\>\d+?\<.title\>/s) {
        getDataPage($unknownUrl);
        return;
    }
    print "UNKNOWN page type for $unknownUrl \n";
}
# getGalPage(targetPageUrl)
#
# Fetch one gallery (search-results) page, run getDataPage() on every
# cgi-bin query link found on it, then follow the "NEXT PAGE" link (if
# any) by recursing on itself.
sub getGalPage{
my $targetPageUrl = shift;
print "getGalPage $targetPageUrl \n";
# Whole page kept in a package global; reused below for pagination.
$galleryPage = `curl --retry 30 -s \'$targetPageUrl\'`;
$_ = $galleryPage;
# All links into the query CGI are individual records (or sub-galleries).
@galLinks = /\<A HREF=\".cgi-bin.query.*?\"\>/gs;
foreach(@galLinks) {
$dataPageUrl = $_;
# Strip the anchor markup, keeping only the href value.
$dataPageUrl =~ s/\<A HREF=\"(.*?)\"\>/\1/;
#print( "glinks: ". $baseURL . $dataPageUrl ."\n");
getDataPage($baseURL . $dataPageUrl);
}
# Pagination: reduce the page to the NEXT PAGE href, if present.
$nextPage = $galleryPage;
$nextPage =~ s/.*a href\=\"(.*?)\"\>NEXT PAGE.*/\1/s;
# Length heuristic: if the substitution failed, $nextPage still holds the
# whole page (very long); a real match is a short relative URL.
if (length($nextPage) < 90 and length($nextPage) > 1){
getGalPage($baseURL . $nextPage);
}
}
# getDataPage(dataPageUrl)
#
# Fetch a single record ("data") page, scrape its catalogue fields into
# package globals, sanitise them for one-line CSV output, build the name
# the image will be saved under, then hand the record's image-page link to
# getImageFromImagePage() -- which is what actually appends the CSV row.
#
# NOTE: $title .. $digitalId, $imgLink and $imgName are deliberately
# package globals; getImageFromImagePage() reads them later.
sub getDataPage{
print "getDataPage \n";
my $subTarget = shift;
# Percent-encode single quotes so the URL cannot break out of the shell
# quoting used around the curl argument below.
$subTarget =~ s/\'/%27/g ;
print "--------runing curl --retry 30 -s \'$subTarget\' \n\n";
$dataPage = `curl --retry 30 -s \'$subTarget\'`;
#make dups to run regex on
$imgLink = $dataPage;
$title = $dataPage;
$created = $dataPage;
$notes = $dataPage;
$subjects = $dataPage;
$names = $dataPage;
$medium = $dataPage;
$callNumber = $dataPage;
$repository = $dataPage;
$digitalId = $dataPage;
#find the correct section using regex; when a pattern does not match the
#variable keeps the whole page, which the length checks below blank out
$imgLink =~ s/.*\<A HREF=\"(.*?)\"\>\W?\<IMG SRC.*?\>.*/$1/s;
$title =~ s/.*TITLE:(.*?)CALL.*/$1/s;
$created =~ s/.*CREATED.PUBLISHED:\<.[Bb]\>(.*?)\<[Bb]\>.*/$1/s;
$notes =~ s/.*NOTES:\<.[Bb]\>(.*?)\<[Bb]\>.*/$1/s;
$subjects =~ s/.*SUBJECTS:\<.[Bb]\>(.*?)\<[Bb]\>.*/$1/s;
$names =~ s/.*RELATED.NAMES:\<.[Bb]\>(.*?)\<[Bb]\>.*/$1/s;
$medium =~ s/.*MEDIUM:\<.[Bb]\>(.*?)\<[Bb]\>.*/$1/s;
$callNumber =~ s/.*CALL.NUMBER:\<.[Bb]\>(.*?)\<[Bb]\>.*/$1/s;
$repository =~ s/.*REPOSITORY:\<.[Bb]\>(.*?)\<[Bb]\>.*/$1/s;
$digitalId =~ s/.*DIGITAL.ID:(.*?)\<P\>.*/$1/s;
#strip html tags, entities, newlines and CSV-hostile characters
$title =~ s/\<[b-zB-Z\/]{1,5}\>|\&nbsp\;|\n|,|\'//g;
$created =~ s/\<.{1,5}\>|\n|,|\'|\"//g;
$notes =~ s/\<.{1,5}\>|\n|,|\'|\"//g;
$subjects =~ s/\<[b-zB-Z\/]{1,5}\>|\n|,|\'//g;
$names =~ s/\<.{1,5}\>|\n|,|\'//g;
$medium =~ s/\<.{1,5}\>|\n|,|\'//g;
$callNumber =~ s/\<.{1,5}\>|\n|,|\'//g;
$repository =~ s/\<.{1,5}\>|\n|,|\'//g;
$digitalId =~ s/\<.{1,5}\>|\n|,|\'//g;
#make links stop being relative
$title =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
$created =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
$notes =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
$subjects =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
$names =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
$medium =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
$callNumber =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
$repository =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
$digitalId =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
#check to make sure that we don't paste a whole page into a field
# BUG FIX: this previously read  length($imgLink) > {500}  -- comparing the
# length against the numified address of an anonymous hash ref (a huge
# number), so the guard could never fire.
if (length($imgLink) > 500){$imgLink = ''};
if (length($title) > 1500){$title = ''};
if (length($created) > 1500){$created = ''};
if (length($notes ) > 1500){$notes = ''};
if (length($subjects ) >1500){$subjects = ''};
if (length($names ) > 1500){$names = ''};
if (length($medium ) >1500){$medium = ''};
if (length($callNumber ) > 1500){$callNumber = ''};
if (length($repository ) > 1500){$repository = ''};
if (length($digitalId ) > 1500){$digitalId = ''};
#specific request for the first 4-digit year to be appended to file names
$firstDate = $created;
$firstDate =~ s/.*?(\d\d\d\d).*/$1/ ;
# If no year matched, $firstDate still holds the whole field -- blank it.
if ($firstDate =~ /.{5}/){$firstDate = ''};
#$uniqueMarker = $$; #use the pid if we are forking
$uniqueMarker = int(rand(9999)); #random marker keeps concurrent file names unique
#set the name the image will be saved under:
#marker + first 23 word-characters of the title + "_" + first date
$imgName = $title;
$imgName =~ s/\<.*?\>|\W//g;
$imgName =~ s/(.{0,23}).*/$1/;
$imgName = $uniqueMarker. $imgName . "_" . $firstDate;
##debug only
# print "\n title". $title;
# print "\n created" . $created;
# print "\n notes" . $notes;
# print "\n subjects" . $subjects;
# print "\n names" . $names;
# print "\n medium" . $medium;
# print "\n callNumber" . $callNumber;
# print "\n repository" . $repository;
# print "\n digitalId" . $digitalId;
# print "\n writable img name" . $imgName.$imgType;
# print "\n";
# A real image href is short; anything longer means no scrape succeeded.
if (!$imgLink || length($imgLink) > 100 ){
print "No downloadable image\n";
# die( "$$ Appears to be a Gallery Page\n");
} else {
#fetch the image (this also writes the CSV row for this record)
getImageFromImagePage($baseURL . $imgLink);
}
} # close sub getDataPage
# getImageFromImagePage(imagePageUrl)
#
# Visit the record's image page, extract the href of its "Retrieve ..."
# link, append one CSV row (from the globals scraped by getDataPage()) to
# $outputFile -- serialised across processes via a crude lock file -- log
# the curl command to $curlStatements, then run curl to download the image
# into $imagesDir.
sub getImageFromImagePage{
print "getImageFromImagePage \n";
my $imgPageUrl = shift;
# Percent-encode single quotes so the URL survives the shell quoting below.
$imgPageUrl =~ s/\'/%27/g ;
$imgPage = `curl --retry 30 -s \'$imgPageUrl\'`;
# Reduce the whole page to just the image file URL.
$imgPage =~ s/.*\<a href=\"(.*?)\"\>Retrieve.*/$1/s ;
if (length($imgPage) > 200 ){
# Substitution failed -- $imgPage still holds a whole page.  Original
# behavior kept: we only warn, and still write the CSV row below.
print "can't find image Link for $imgName \n";
}
# File extension (last ".xxx"); global because the CSV row uses it too.
$imgType = $imgPage;
$imgType =~ s/.*(\....)$/$1/;
# Crude cross-process lock around the shared CSV file: spin while another
# process holds the lock file, then create it ourselves.
$lockfile="lock_the_file.loc";
while (-e $lockfile) {
print "sleep for $$ \n";
sleep 2;
}
open (LOCK,">$lockfile") || die ("Cannot open lock file!\n");
close (LOCK);
open(DAT,">>$outputFile") || die("Cannot Open $outputFile ");
print DAT "\n". $title;
print DAT ',' . $created;
print DAT ',' . $notes;
print DAT ',' . $subjects;
print DAT ',' . $names;
print DAT ',' . $medium;
print DAT ',' . $callNumber;
print DAT ',' . $repository;
print DAT ',' . $digitalId;
print DAT ',' . $imgName.$imgType;
close(DAT);
my $myCurl = "curl --retry 30 " . $imgPage." -o ".$imagesDir.$imgName.$imgType . "\n";
# BUG FIX: the die() message previously named $outputFile, not the file
# actually being opened.
open(CURL,">>$curlStatements") || die("Cannot Open $curlStatements ");
print CURL $myCurl;
# BUG FIX: CURL was never closed, so the buffered log line could be lost
# if the long-running download below was interrupted.
close(CURL);
unlink($lockfile);
print "\nRUNNING " . $myCurl ;
# SECURITY NOTE(review): $imgPage and $imgName are interpolated into a
# shell command unquoted -- a hostile page could inject shell syntax.
# Consider system('curl', '--retry', '30', $imgPage, '-o', $dest) instead.
system($myCurl);
} # close sub getImageFromImagePage