=pod

=head1 CONTEXT

This little gem came to me by way of a craiglist job. I wound up making
about a hundred bucks for something like 50 hours of work on this project
before I cut the guy off. At which point he threatened legal action and
promised me I'd never work in Colorado again. This sort of thing is why I
don't take random craiglist jobs any more. It is also why people are
scared of old Perl. -- BPB

=cut

# --------------------------------------------------------------------------
# Configuration
# --------------------------------------------------------------------------

## $SubjectIndex ='http://lcweb2.loc.gov/pp/ahiihtml/ahiisubjindex1.html';
$SubjectIndex = 'http://lcweb2.loc.gov/pp/pgzhtml/pgzsubjindex1.html'; ## set this line to the archives you want to download.

my $curlStatements = "photoCurlStatements.txt"; # log of every curl command built for an image download
my $problemPageList = "photoProblemPages.txt"; # NOTE(review): declared but never used in the visible code
my $imagesDir = 'Photos/'; #name it whatever you like, just make sure (comment truncated in original - presumably "...it exists and ends in a slash"; TODO confirm)
my $outputFile = "photoData.csv"; #name it whatever you like
my $progressFile = "photoProgress.txt"; #this tracks how far the script has gone (very coarsely)
my $baseURL = 'http://lcweb2.loc.gov'; # This really shouldn't be changed, script is not very re-usable
my $lastPlacePage = 28; #the highest numbered place page (NOTE(review): never read in the visible code)

#################################################
# Programmers guide:
# This is a non-generalised script for downloading
# map files and information that can be found at
# http://lcweb2.loc.gov/ammem/gmdhtml/gmdgeogindex1.html
#
# Major Subsections and functions
#
# getPlacePage(placePageUrl)
#   This function fetches the Place Page list and then
#   goes through it one item at a time passing the urls
#   to detectPageType
#
# detectPageType(unknownPageUrl)
#   Takes a page URL, and figures out if it is a gallery
#   page, or a data page. If it is a gallery page it also
#   attempts to find if there are more in the series.
#   It then runs getGalPage for each one in the series, or if
#   it is a data page, it forks off a getDataPage for it.
#   --------------------
#
# getGalPage(targetPageUrl)
#   This function takes in a gallery page and forks off
#   a new process for each link on it. The process then
#   runs getDataPage
#
# getDataPage(dataPageUrl)
#   This sub attempts to parse the page it has reached,
#   clean up the data, and then put it into the CSV file;
#   then it takes the image link it has found and passes
#   it to getImageFromImagePage
#
# getImageFromImagePage(imagePageUrl)
#   This visits the Image Page Url, and then does the
#   parsing necessary to actually find the image file's
#   Url. It then uses exec to launch a copy of curl to
#   download the (often large) file. Because it
#   uses exec the process dies here.
#   (NOTE(review): the code as visible actually uses system(), not exec,
#   so the process survives - confirm which behaviour is intended.)
#################################################

# Leftover ad-hoc test calls, kept commented out:
#$targetPage = 'http://lcweb2.loc.gov/cgi-bin/query/S?ammem/gmd:@FILREQ(@field(SUBJ+@od1(Bird%27s-eye+view+prints--1860-1870+))+@FIELD(COLLID+citymap))';
#$targetPage = 'http://lcweb2.loc.gov/cgi-bin/query/d?gmd:20:./temp/~ammem_1fen:';
#getGalPage($targetPage);
#detectPageType($mysPage);

# Commented-out progress-file resume logic. NOTE(review): the readline that
# filled @file - presumably <PROGRESSFILE> - was eaten by the HTML mangling
# this file suffered, and "$start = chomp(...)" would store chomp's return
# count rather than the line itself; restore from an uncorrupted copy before
# re-enabling.
# if (open PROGRESSFILE, "<$progressFile"){
#     my @file = ;
#     close PROGRESSFILE;
#     $start = chomp($file[0]);
#     print "Progress File says start at $start \n";
# } else {
#     print "No Progress File found so starting at the begining.
# --------------------------------------------------------------------------
# NOTE(review): this file appears to have been mangled in transit by an
# HTML-aware paste: literal <tag>-shaped sequences inside regexes were eaten,
# leaving several link-extraction patterns truncated (some unterminated).
# Those remnants are preserved verbatim below and flagged individually; they
# must be restored from an uncorrupted copy of the script.
# --------------------------------------------------------------------------
# Residue of the commented-out progress-file block above:
# \n";
#     my $start = 1;
# }

#$place = 'http://lcweb2.loc.gov/pp/ahiihtml/ahiisubjindex1.html';
#getPlacePage($place);

# Sample URLs kept around for ad-hoc testing of the individual subs below.
$photo = 'http://lcweb2.loc.gov/cgi-bin/query/I?ils:1:./temp/~pp_GhEo::displayType=1:m856sd=cph:m856sf=3b28121:@@@';
#getImageFromImagePage($photo);
$dataPage = 'http://lcweb2.loc.gov/cgi-bin/query/S?pp/ils:@FILREQ(@field(SUBJ+@od1(Galatasaray+Mekteb-i+Sultanisi--Buildings--1880-1900+))+@FIELD(COLLID+ahii))';
#getDataPage($dataPage);
$galleryPage = 'http://lcweb2.loc.gov/cgi-bin/query/S?pp/ils:@FILREQ(@field(SUBJ+@od1(Garden+rooms--Turkey--Istanbul--1880-1900+))+@FIELD(COLLID+ahii))';
#detectPageType($galleryPage);
#detectPageType($dataPage);
$subjectsPage ='http://lcweb2.loc.gov/pp/ahiihtml/ahiiSubjects04.html';
#getPlacePage($subjectsPage);

# Entry point: crawl everything reachable from the configured subject index.
getSubjectPage($SubjectIndex);

#getCollections(); #no longer being used

# FIX(review): getCollections() is dead code (its only call site, above, is
# commented out) and its body is corrupted - the link-extraction regex and an
# enclosing foreach/if were eaten by the HTML mangling, leaving an orphan
# "else" and unbalanced braces that break compilation.  It is commented out
# wholesale, remnants preserved verbatim, rather than guessed at; restore it
# from an uncorrupted copy if it is ever needed again.
#sub getCollections{
#    print "getCollection \n";
#    my $collectionPageUrl = 'http://lcweb2.loc.gov/pp/pphome.html';
#    $collectionPage = `curl --retry 30 -s \'$collectionPageUrl\'`;
#    $_ = $collectionPage;
#    @collectionLinks = /\.*/\1/s ;
#    if (length($subjectSubpage) > 150){
#        #print length($subjectSubpage) . " wrong kind of page2\n";
#    }else {
#        $getpage = $baseURL . "/pp/" . $subjectSubpage;
#        print "!!!!!!!!! " .$getpage . " !!! \n\n\n";
#        getSubjectPage($getpage);
#    }
#    } else {
#        #print length($subjectSubpage) . " wrong kind of page\n";
#    }
#    }
#}

# getSubjectPage(subjectIndexUrl)
# Fetches a subject-index page (via shelled-out curl), extracts the
# per-subject links, and hands each one - resolved against the index page's
# directory - to getPlacePage().
sub getSubjectPage{
    print "getSubjectPage \n";
    my $localSubjectPageUrl = shift;
    # Single quotes around the URL keep shell metacharacters inert.
    $localSubjectPage = `curl --retry 30 -s \'$localSubjectPageUrl\'`;
    $_ = $localSubjectPage;
    # NOTE(review): pattern truncated by the HTML mangling (the literal
    # <a href=...> core is gone) and is now unterminated; remnant kept
    # verbatim - restore from an uncorrupted copy.
    @subjectLinks = /From.*?\/gs;
    # Base directory of the index page, for resolving relative links.
    $base = $localSubjectPageUrl;
    $base =~ s/(.*)\/.*?$/\1/s ;
    foreach(@subjectLinks) {
        $dataPageUrl = $_;
        # NOTE(review): truncated substitution - remnant kept verbatim.
        $dataPageUrl =~ s/From.*\/\1/;
        print $base ."/" .$dataPageUrl. "000 \n";
        getPlacePage( $base ."/" .$dataPageUrl);
    }
}

# getPlacePage(placePageUrl)
# Fetches a "place page" listing, pulls out each item link, and passes the
# absolute URL to detectPageType() for classification and processing.
sub getPlacePage{
    print "getPlacePage \n";
    my $localPlacePageUrl = shift;
    $localPlacePage = `curl --retry 30 -s \'$localPlacePageUrl\'`;
    $_ = $localPlacePage;
    # NOTE(review): truncated, unterminated pattern (see file note) -
    # remnant kept verbatim.
    @placeLinks = /\/gs;
    foreach(@placeLinks) {
        $dataPageUrl = $_;
        # NOTE(review): truncated substitution - remnant kept verbatim.
        $dataPageUrl =~ s/\/\1/;
        print $dataPageUrl;
        detectPageType($baseURL . $dataPageUrl);
    }
}

# detectPageType(unknownPageUrl)
# Fetches a page and inspects its title to decide whether it is a gallery
# ("Search Results") page or a single-item data page, then dispatches to
# getGalPage() or getDataPage() accordingly.
sub detectPageType{
    print "detectPageType \n";
    my $mysteryPageUrl = shift;
    #print "FINDING page type for $mysteryPageUrl\n";
    $mysteryPage = `curl --retry 30 -s \'$mysteryPageUrl\'`;
    # NOTE(review): both title tests lost their opening <title> literal to
    # the HTML mangling; remnants kept verbatim.
    if ($mysteryPage =~ /\Search Results\<.title\>/s){
        getGalPage($mysteryPageUrl);
    } elsif ($mysteryPage =~ /\\d+?\<.title\>/s){
        getDataPage($mysteryPageUrl);
    } else {
        print "UNKNOWN page type for $mysteryPageUrl \n";
    }
}

# getGalPage(targetPageUrl)
# Processes one gallery (search-results) page: runs getDataPage() for every
# item link on it, then follows the "NEXT PAGE" link recursively until the
# series runs out.
sub getGalPage{
    my $targetPageUrl = shift;
    print "getGalPage $targetPageUrl \n";
    $galleryPage = `curl --retry 30 -s \'$targetPageUrl\'`;
    $_ = $galleryPage;
    # NOTE(review): truncated, unterminated pattern - remnant kept verbatim.
    @galLinks = /\/gs;
    foreach(@galLinks) {
        $dataPageUrl = $_;
        # NOTE(review): truncated substitution - remnant kept verbatim.
        $dataPageUrl =~ s/\/\1/;
        #print( "glinks: ". $baseURL . $dataPageUrl ."\n");
        getDataPage($baseURL . $dataPageUrl);
    }
    # Pagination: if the substitution matched, $nextPage holds just the NEXT
    # PAGE href; if it failed, it still holds the whole page, which the
    # length window below filters out.
    $nextPage = $galleryPage;
    $nextPage =~ s/.*a href\=\"(.*?)\"\>NEXT PAGE.*/\1/s;
    if (length($nextPage) < 90 and length($nextPage) > 1){
        getGalPage($baseURL . $nextPage);
    }
}

# getDataPage(dataPageUrl)
# Fetches a single item's data page, regex-scrapes each catalogue field out
# of the HTML, cleans the values up for CSV output, and - if a plausible
# image link was found - hands it to getImageFromImagePage().  The scraped
# fields are deliberately left in file-wide globals ($title, $created, ...)
# because getImageFromImagePage() writes them to the CSV later.
sub getDataPage{
    print "getDataPage \n";
    my $subTarget = shift;
    # Encode single quotes so the shell-quoted curl argument cannot break out.
    $subTarget =~ s/\'/%27/g ;
    print "--------runing curl --retry 30 -s \'$subTarget\' \n\n";
    $dataPage = `curl --retry 30 -s \'$subTarget\'`;

    #make dups to run regex on
    $imgLink = $dataPage;
    $title = $dataPage;
    $created = $dataPage;
    $notes = $dataPage;
    $subjects = $dataPage;
    $names = $dataPage;
    $medium = $dataPage;
    $callNumber = $dataPage;
    $repository = $dataPage;
    $digitalId = $dataPage;

    #find the correct section using regex
    # (a failed match leaves the whole page in the variable - the length
    # guards further down blank such values)
    # NOTE(review): the $imgLink pattern lost its <a href=...> core to the
    # HTML mangling; remnant kept verbatim.
    $imgLink =~ s/.*\\W?\.*/\1/s;
    #print $imgLink ."\n";
    $title =~ s/.*TITLE:(.*?)CALL.*/\1/s;
    $created =~ s/.*CREATED.PUBLISHED:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
    $notes =~ s/.*NOTES:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
    $subjects =~ s/.*SUBJECTS:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
    $names =~ s/.*RELATED.NAMES:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
    $medium =~ s/.*MEDIUM:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
    $callNumber =~ s/.*CALL.NUMBER:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
    $repository =~ s/.*REPOSITORY:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
    # NOTE(review): tail of this pattern looks truncated too - verify.
    $digitalId =~ s/.*DIGITAL.ID:(.*?)\.*/\1/s;

    #strip html, newlines, commas and quotes (commas/quotes would break the
    # hand-rolled CSV)
    # NOTE(review): the "\ \;" alternation below was presumably "\&nbsp\;"
    # before the mangling - confirm against the original.
    $title =~ s/\<[b-zB-Z\/]{1,5}\>|\ \;|\n|,|\'//g;
    $created =~ s/\<.{1,5}\>|\n|,|\'|\"//g;
    $notes =~ s/\<.{1,5}\>|\n|,|\'|\"//g;
    $subjects =~ s/\<[b-zB-Z\/]{1,5}\>|\n|,|\'//g;
    $names =~ s/\<.{1,5}\>|\n|,|\'//g;
    $medium =~ s/\<.{1,5}\>|\n|,|\'//g;
    $callNumber =~ s/\<.{1,5}\>|\n|,|\'//g;
    $repository =~ s/\<.{1,5}\>|\n|,|\'//g;
    $digitalId =~ s/\<.{1,5}\>|\n|,|\'//g;

    #make links stop being relative
    $title =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
    $created =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
    $notes =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
    $subjects =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
    $names =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
    $medium =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
    $callNumber =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
    $repository =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
    $digitalId =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;

    #check to make sure that we don't paste a whole page into a field
    # FIX(review): was "length($imgLink) > {500}" - comparing against an
    # anonymous hash reference, whose numified address meant the guard could
    # never fire.  Restored to a plain numeric threshold (the neighbours use
    # 1500; confirm which value the original intended).
    if (length($imgLink) > 500){$imgLink = ''};
    if (length($title) > 1500){$title = ''};
    if (length($created) > 1500){$created = ''};
    if (length($notes ) > 1500){$notes = ''};
    if (length($subjects ) >1500){$subjects = ''};
    if (length($names ) > 1500){$names = ''};
    if (length($medium ) >1500){$medium = ''};
    if (length($callNumber ) > 1500){$callNumber = ''};
    if (length($repository ) > 1500){$repository = ''};
    if (length($digitalId ) > 1500){$digitalId = ''};

    #specific request for the first date to be appended to file names
    $firstDate = $created;
    $firstDate =~ s/.*?(\d\d\d\d).*/\1/ ;
    # Anything longer than four characters means no year was found.
    if ($firstDate =~ /.{5}/){$firstDate = ''};

    #$uniqueMarker = $$; #use the pid if we are forking
    $uniqueMarker = int(rand(9999)); #use the pid if we are forking

    #set the name the image will be saved under:
    # random marker + first 23 word-characters of the title + first date
    $imgName = $title;
    $imgName =~ s/\<.*?\>|\W//g;
    $imgName =~ s/(.{0,23}).*/\1/;
    $imgName = $uniqueMarker. $imgName . "_" . $firstDate;

    ##debug only
    # print "\n title". $title;
    # print "\n created" . $created;
    # print "\n notes" . $notes;
    # print "\n subjects" . $subjects;
    # print "\n names" . $names;
    # print "\n medium" . $medium;
    # print "\n callNumber" . $callNumber;
    # print "\n repository" . $repository;
    # print "\n digitalId" . $digitalId;
    # print "\n writable img name" . $imgName.$imgType;
    # print "\n";

    if (!$imgLink || length($imgLink) > 100 ){
        print "No downloadable image\n";
        # die( "$$ Appears to be a Gallery Page\n");
    } else {
        #fetch the image
        getImageFromImagePage($baseURL . $imgLink);
    }
} # close sub getDataPage

# getImageFromImagePage(imagePageUrl)
# Fetches the intermediate image page, digs the real image URL out of it,
# appends one CSV row of the scraped metadata (under a crude lock file so
# parallel workers don't interleave rows), logs the curl command to
# $curlStatements, then runs curl to download the (often large) image.
sub getImageFromImagePage{
    print "getImageFromImagePage \n";
    my $imgPageUrl = shift;
    $imgPageUrl =~ s/\'/%27/g ;
    #print $imgPageUrl . "\n";
    $imgPage = `curl --retry 30 -s \'$imgPageUrl\'`;
    # NOTE(review): pattern corrupted by the HTML mangling (its capture
    # group is gone, so \1 is empty); remnant kept verbatim - restore from
    # an uncorrupted copy.
    $imgPage =~ s/.*\Retrieve.*/\1/s ;
    # A long result means the substitution did not match and $imgPage still
    # holds the whole page.  NOTE(review): execution continues regardless -
    # presumably $problemPageList was meant to record these; confirm.
    if (length($imgPage) > 200 ){
        print "can't find image Link for $imgName \n";
    }
    # File extension = last four characters of the image URL (".jpg" etc).
    $imgType = $imgPage;
    $imgType =~ s/.*(\....)$/\1/;

    # Lock and write out data to our output file.
    # NOTE(review): this check-then-create lock is not race-free - two
    # sleeping processes can both see the file vanish and both "acquire" it.
    $lockfile="lock_the_file.loc";
    while (-e $lockfile) {
        print "sleep for $$ \n";
        sleep 2;
    }
    open (LOCK,">$lockfile") || die ("Cannot open lock file!\n");
    close (LOCK);
    open(DAT,">>$outputFile") || die("Cannot Open $outputFile ");
    print DAT "\n". $title;
    print DAT ',' . $created;
    print DAT ',' . $notes;
    print DAT ',' . $subjects;
    print DAT ',' . $names;
    print DAT ',' . $medium;
    print DAT ',' . $callNumber;
    print DAT ',' . $repository;
    print DAT ',' . $digitalId;
    print DAT ',' . $imgName.$imgType;
    close(DAT);
    my $myCurl = "curl --retry 30 " . $imgPage." -o ".$imagesDir.$imgName.$imgType . "\n";
    # FIX(review): the die message wrongly named $outputFile, and the CURL
    # handle was never closed (risking one lost, buffered line per process).
    open(CURL,">>$curlStatements") || die("Cannot Open $curlStatements ");
    print CURL $myCurl;
    close(CURL);
    unlink($lockfile);
    print "\nRUNNING " . $myCurl ;
    system($myCurl);
} # close sub getImageFromImagePage