|
|
- =pod
-
- =head1 CONTEXT
-
- This little gem came to me by way of a craiglist job. I wound up making about a
- hundred bucks for something like 50 hours of work on this project before I
- cut the guy off. At which point he threatened legal action and promised me
- I'd never work in Colorado again.
-
- This sort of thing is why I don't take random craiglist jobs any more. It is
- also why people are scared of old Perl.
-
- -- BPB
-
- =cut
-
-
- ## $SubjectIndex ='http://lcweb2.loc.gov/pp/ahiihtml/ahiisubjindex1.html';
-
- $SubjectIndex = 'http://lcweb2.loc.gov/pp/pgzhtml/pgzsubjindex1.html';
-
- ## set this line to the archives you want to download.
-
-
- my $curlStatements = "photoCurlStatements.txt";
- my $problemPageList = "photoProblemPages.txt";
- my $imagesDir = 'Photos/'; #name it whatever you like, just make sure
- my $outputFile = "photoData.csv"; #name it whatever you like
- my $progressFile = "photoProgress.txt"; #this tracks how far the script has gone (very coursely)
-
- my $baseURL = 'http://lcweb2.loc.gov'; # This really shouldn't be changed, script is not very re-usable
- my $lastPlacePage = 28; #the highest numbered place page
-
- #################################################
- # Programmers guide:
- # This is a non-generalised script for downloading
- # map files and information that can be found at
- # http://lcweb2.loc.gov/ammem/gmdhtml/gmdgeogindex1.html
-
- # Major Subsections and functions
- # getPlacePage(placePageUrl)
- # This function fetches the Place Page list and then
- # goes through it one item at a time passing the urls
- # to detectPageType
-
- # detectPageType(unknownPageUrl)
- # Takes a page URL, and figures out if it is a gallery
- # page, or a data page. If it is a gallery page it also
- # attempts to also find if there are more in the series.
- # it then runs getGalPage for each one in the series, or if
- # it is a data page, it forks of a getDataPage for it.
- # --------------------
-
- # getGalPage(targetPageUrl)
- # This function takes in a gallery page and forks off
- # a new Process for each link on it. The process then
- # runs getDataPage
-
- # getDataPage(dataPageUrl)
- # This script attempts to parse the page it has reached
- # clean up the data, and then put it into the CSV file
- # then it takes the image link it has found and passes
- # it to getImageFromImagePage
-
- # getImageFromImagePage(imagePageUrl)
- # This visits the Image Page Url, and then does the
- # parsing nessisary to actually find the image files
- # Url. It then uses exec to launch a copy of curl to
- # Download the the (often large) file. Because it
- # uses exec the process dies here.
- #################################################
-
-
- #$targetPage = 'http://lcweb2.loc.gov/cgi-bin/query/S?ammem/gmd:@FILREQ(@field(SUBJ+@od1(Bird%27s-eye+view+prints--1860-1870+))+@FIELD(COLLID+citymap))';
-
- #$targetPage = 'http://lcweb2.loc.gov/cgi-bin/query/d?gmd:20:./temp/~ammem_1fen:';
-
- #getGalPage($targetPage);
-
- #detectPageType($mysPage);
-
- # if (open PROGRESSFILE, "<$progressFile"){
- # my @file = <PROGRESSFILE>;
- # close PROGRESSFILE;
- # $start = chomp($file[0]);
- # print "Progress File says start at $start \n";
- # } else {
- # print "No Progress File found so starting at the begining. \n";
- # my $start = 1;
- # }
-
- #$place = 'http://lcweb2.loc.gov/pp/ahiihtml/ahiisubjindex1.html';
- #getPlacePage($place);
-
- $photo = 'http://lcweb2.loc.gov/cgi-bin/query/I?ils:1:./temp/~pp_GhEo::displayType=1:m856sd=cph:m856sf=3b28121:@@@';
- #getImageFromImagePage($photo);
-
- $dataPage = 'http://lcweb2.loc.gov/cgi-bin/query/S?pp/ils:@FILREQ(@field(SUBJ+@od1(Galatasaray+Mekteb-i+Sultanisi--Buildings--1880-1900+))+@FIELD(COLLID+ahii))';
- #getDataPage($dataPage);
-
- $galleryPage = 'http://lcweb2.loc.gov/cgi-bin/query/S?pp/ils:@FILREQ(@field(SUBJ+@od1(Garden+rooms--Turkey--Istanbul--1880-1900+))+@FIELD(COLLID+ahii))';
- #detectPageType($galleryPage);
- #detectPageType($dataPage);
-
- $subjectsPage ='http://lcweb2.loc.gov/pp/ahiihtml/ahiiSubjects04.html';
- #getPlacePage($subjectsPage);
-
-
- getSubjectPage($SubjectIndex);
-
- #getCollections();
-
-
- #no longer being used
- sub getCollections{
- print "getCollection \n";
- my $collectionPageUrl = 'http://lcweb2.loc.gov/pp/pphome.html';
- $collectionPage = `curl --retry 30 -s \'$collectionPageUrl\'`;
- $_ = $collectionPage;
- @collectionLinks = /\<A HREF=\".*?\"/gs;
-
- foreach(@collectionLinks) {
- $subjectPageUrl = $_;
- $subjectPageUrl =~ s/\<A HREF=\"(.*?)\"/\1/;
- #print $baseURL . "/pp/" . $subjectPageUrl. " ****\n";
-
- $subSubUrl = $baseURL . "/pp/" . $subjectPageUrl. "\n";
- $subjectSubpage = `curl --retry 30 -s \'$subSubUrl\'`;
-
- if($subjectSubpage =~ /Subject.and.format.headings/s){
- $subjectSubpage =~ s/.*Browse.*?\<A HREF=\"(.*?)\"\>.*/\1/s ;
- if (length($subjectSubpage) > 150){
- #print length($subjectSubpage) . " wrong kind of page2\n";
- }else {
- $getpage = $baseURL . "/pp/" . $subjectSubpage;
- print "!!!!!!!!! " .$getpage . " !!! \n\n\n";
- getSubjectPage($getpage);
- }
- } else {
- #print length($subjectSubpage) . " wrong kind of page\n";
- }
- }
-
- }
-
-
-
-
- sub getSubjectPage{
- print "getSubjectPage \n";
- my $localSubjectPageUrl = shift;
- $localSubjectPage = `curl --retry 30 -s \'$localSubjectPageUrl\'`;
- $_ = $localSubjectPage;
- @subjectLinks = /From.*?\<a href=\".*?\"\>/gs;
-
- $base = $localSubjectPageUrl;
- $base =~ s/(.*)\/.*?$/\1/s ;
-
- foreach(@subjectLinks) {
- $dataPageUrl = $_;
- $dataPageUrl =~ s/From.*\<a href=\"(.*?)\"\>/\1/;
- print $base ."/" .$dataPageUrl. "000 \n";
- getPlacePage( $base ."/" .$dataPageUrl);
- }
- }
-
-
- sub getPlacePage{
- print "getPlacePage \n";
- my $localPlacePageUrl = shift;
- $localPlacePage = `curl --retry 30 -s \'$localPlacePageUrl\'`;
- $_ = $localPlacePage;
- @placeLinks = /\<A HREF=\".cgi-bin.query.*?\"\>/gs;
-
- foreach(@placeLinks) {
- $dataPageUrl = $_;
- $dataPageUrl =~ s/\<A HREF=\"(.*?)\"\>/\1/;
- print $dataPageUrl;
- detectPageType($baseURL . $dataPageUrl);
- }
-
- }
-
- sub detectPageType{
- print "detectPageType \n";
- my $mysteryPageUrl = shift;
-
- #print "FINDING page type for $mysteryPageUrl\n";
- $mysteryPage = `curl --retry 30 -s \'$mysteryPageUrl\'`;
-
- if ($mysteryPage =~ /\<title\>Search Results\<.title\>/s){
- getGalPage($mysteryPageUrl);
- } elsif ($mysteryPage =~ /\<title\>\d+?\<.title\>/s){
- getDataPage($mysteryPageUrl);
- } else {
- print "UNKNOWN page type for $mysteryPageUrl \n";
- }
-
- }
-
- sub getGalPage{
- my $targetPageUrl = shift;
-
- print "getGalPage $targetPageUrl \n";
- $galleryPage = `curl --retry 30 -s \'$targetPageUrl\'`;
-
- $_ = $galleryPage;
- @galLinks = /\<A HREF=\".cgi-bin.query.*?\"\>/gs;
-
- foreach(@galLinks) {
- $dataPageUrl = $_;
- $dataPageUrl =~ s/\<A HREF=\"(.*?)\"\>/\1/;
- #print( "glinks: ". $baseURL . $dataPageUrl ."\n");
- getDataPage($baseURL . $dataPageUrl);
- }
-
- $nextPage = $galleryPage;
- $nextPage =~ s/.*a href\=\"(.*?)\"\>NEXT PAGE.*/\1/s;
- if (length($nextPage) < 90 and length($nextPage) > 1){
- getGalPage($baseURL . $nextPage);
- }
-
-
- }
-
- sub getDataPage{
- print "getDataPage \n";
- my $subTarget = shift;
- $subTarget =~ s/\'/%27/g ;
- print "--------runing curl --retry 30 -s \'$subTarget\' \n\n";
- $dataPage = `curl --retry 30 -s \'$subTarget\'`;
-
- #make dups to run regex on
- $imgLink = $dataPage;
- $title = $dataPage;
- $created = $dataPage;
- $notes = $dataPage;
- $subjects = $dataPage;
- $names = $dataPage;
- $medium = $dataPage;
- $callNumber = $dataPage;
- $repository = $dataPage;
- $digitalId = $dataPage;
-
- #find the correct section using regex
- $imgLink =~ s/.*\<A HREF=\"(.*?)\"\>\W?\<IMG SRC.*?\>.*/\1/s;
-
- #print $imgLink ."\n";
-
- $title =~ s/.*TITLE:(.*?)CALL.*/\1/s;
- $created =~ s/.*CREATED.PUBLISHED:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
- $notes =~ s/.*NOTES:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
- $subjects =~ s/.*SUBJECTS:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
- $names =~ s/.*RELATED.NAMES:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
- $medium =~ s/.*MEDIUM:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
- $callNumber =~ s/.*CALL.NUMBER:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
-
- $repository =~ s/.*REPOSITORY:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
-
- $digitalId =~ s/.*DIGITAL.ID:(.*?)\<P\>.*/\1/s;
-
- #strip html
- $title =~ s/\<[b-zB-Z\/]{1,5}\>|\ \;|\n|,|\'//g;
- $created =~ s/\<.{1,5}\>|\n|,|\'|\"//g;
- $notes =~ s/\<.{1,5}\>|\n|,|\'|\"//g;
- $subjects =~ s/\<[b-zB-Z\/]{1,5}\>|\n|,|\'//g;
- $names =~ s/\<.{1,5}\>|\n|,|\'//g;
- $medium =~ s/\<.{1,5}\>|\n|,|\'//g;
- $callNumber =~ s/\<.{1,5}\>|\n|,|\'//g;
- $repository =~ s/\<.{1,5}\>|\n|,|\'//g;
- $digitalId =~ s/\<.{1,5}\>|\n|,|\'//g;
-
- #make links stop being relative
- $title =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
- $created =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
- $notes =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
- $subjects =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
- $names =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
- $medium =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
- $callNumber =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
- $repository =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
- $digitalId =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
-
-
- #check to make sure that we don't paste a whole page into a field
- if (length($imgLink) > {500}){$imgLink = ''};
- if (length($title) > 1500){$title = ''};
- if (length($created) > 1500){$created = ''};
- if (length($notes ) > 1500){$notes = ''};
- if (length($subjects ) >1500){$subjects = ''};
- if (length($names ) > 1500){$names = ''};
- if (length($medium ) >1500){$medium = ''};
- if (length($callNumber ) > 1500){$callNumber = ''};
- if (length($repository ) > 1500){$repository = ''};
- if (length($digitalId ) > 1500){$digitalId = ''};
-
- #specific request for the first date to be appended to file names
- $firstDate = $created;
- $firstDate =~ s/.*?(\d\d\d\d).*/\1/ ;
- if ($firstDate =~ /.{5}/){$firstDate = ''};
-
- #$uniqueMarker = $$; #use the pid if we are forking
- $uniqueMarker = int(rand(9999)); #use the pid if we are forking
-
- #set the name the image will be saved under
- $imgName = $title;
- $imgName =~ s/\<.*?\>|\W//g;
- $imgName =~ s/(.{0,23}).*/\1/;
- $imgName = $uniqueMarker. $imgName . "_" . $firstDate;
-
-
- ##debug only
- # print "\n title". $title;
- # print "\n created" . $created;
- # print "\n notes" . $notes;
- # print "\n subjects" . $subjects;
- # print "\n names" . $names;
- # print "\n medium" . $medium;
- # print "\n callNumber" . $callNumber;
- # print "\n repository" . $repository;
- # print "\n digitalId" . $digitalId;
- # print "\n writable img name" . $imgName.$imgType;
- # print "\n";
-
-
-
- if (!$imgLink || length($imgLink) > 100 ){
- print "No downloadable image\n";
- # die( "$$ Appears to be a Gallery Page\n");
- } else {
- #fetch the image
- getImageFromImagePage($baseURL . $imgLink);
- }
- } # close sub getDataPage
-
- sub getImageFromImagePage{
- print "getImageFromImagePage \n";
- my $imgPageUrl = shift;
- $imgPageUrl =~ s/\'/%27/g ;
- #print $imgPageUrl . "\n";
- $imgPage = `curl --retry 30 -s \'$imgPageUrl\'`;
- $imgPage =~ s/.*\<a href=\"(.*?)\"\>Retrieve.*/\1/s ;
- if (length($imgPage) > 200 ){
- print "can't find image Link for $imgName \n";
- }
- $imgType = $imgPage;
- $imgType =~ s/.*(\....)$/\1/;
- # Lock and write out data to our output file;
- $lockfile="lock_the_file.loc";
- while (-e $lockfile) {
- print "sleep for $$ \n";
- sleep 2;
- }
- open (LOCK,">$lockfile") || die ("Cannot open lock file!\n");
- close (LOCK);
-
- open(DAT,">>$outputFile") || die("Cannot Open $outputFile ");
- print DAT "\n". $title;
- print DAT ',' . $created;
- print DAT ',' . $notes;
- print DAT ',' . $subjects;
- print DAT ',' . $names;
- print DAT ',' . $medium;
- print DAT ',' . $callNumber;
- print DAT ',' . $repository;
- print DAT ',' . $digitalId;
- print DAT ',' . $imgName.$imgType;
- close(DAT);
-
- my $myCurl = "curl --retry 30 " . $imgPage." -o ".$imagesDir.$imgName.$imgType . "\n";
-
- open(CURL,">>$curlStatements") || die("Cannot Open $outputFile ");
- print CURL $myCurl;
- unlink($lockfile);
-
- print "\nRUNNING " . $myCurl ;
- system($myCurl);
- } # close sub getImageFromImagePage
-
|