Just awful.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 

379 lines
12 KiB

=pod

=head1 CONTEXT

This little gem came to me by way of a craigslist job. I wound up making about a
hundred bucks for something like 50 hours of work on this project before I
cut the guy off. At which point he threatened legal action and promised me
I'd never work in Colorado again.

This sort of thing is why I don't take random craigslist jobs any more. It is
also why people are scared of old Perl.

-- BPB

=cut
## Alternate subject index (an earlier collection), kept for reference:
## $SubjectIndex ='http://lcweb2.loc.gov/pp/ahiihtml/ahiisubjindex1.html';
# Entry-point URL the script walks.  Deliberately a package global (no my);
# it is passed to getSubjectPage() at the bottom of the file.
$SubjectIndex = 'http://lcweb2.loc.gov/pp/pgzhtml/pgzsubjindex1.html';
## set this line to the archives you want to download.
my $curlStatements = "photoCurlStatements.txt"; # log of every curl download command issued
my $problemPageList = "photoProblemPages.txt"; # NOTE(review): declared but never used in this file
my $imagesDir = 'Photos/'; #name it whatever you like, just make sure it ends in '/' (it is concatenated directly with the file name)
my $outputFile = "photoData.csv"; #name it whatever you like
my $progressFile = "photoProgress.txt"; #this tracks how far the script has gone (very coarsely); the resume code that reads it is currently commented out below
my $baseURL = 'http://lcweb2.loc.gov'; # This really shouldn't be changed, script is not very re-usable
my $lastPlacePage = 28; #the highest numbered place page (NOTE(review): not referenced anywhere in this file)
#################################################
# Programmers guide:
# This is a non-generalised script for downloading
# map files and information that can be found at
# http://lcweb2.loc.gov/ammem/gmdhtml/gmdgeogindex1.html
# Major Subsections and functions
# getPlacePage(placePageUrl)
# This function fetches the Place Page list and then
# goes through it one item at a time passing the urls
# to detectPageType
# detectPageType(unknownPageUrl)
# Takes a page URL, and figures out if it is a gallery
# page, or a data page. If it is a gallery page it also
# attempts to find whether there are more pages in the series.
# It then runs getGalPage for each one in the series, or if
# it is a data page, it forks off a getDataPage for it.
# --------------------
# getGalPage(targetPageUrl)
# This function takes in a gallery page and forks off
# a new Process for each link on it. The process then
# runs getDataPage
# getDataPage(dataPageUrl)
# This function attempts to parse the page it has reached,
# clean up the data, and then put it into the CSV file;
# then it takes the image link it has found and passes
# it to getImageFromImagePage
# getImageFromImagePage(imagePageUrl)
# This visits the Image Page URL, and then does the
# parsing necessary to actually find the image file's
# URL. It then uses exec to launch a copy of curl to
# download the (often large) file. Because it
# uses exec the process dies here.
#################################################
## ----------------------------------------------------------------------
## Leftover manual test harness: each commented pair below exercised one
## sub against a known URL during development.  The only live call in this
## section is getSubjectPage($SubjectIndex) at the end.
## ----------------------------------------------------------------------
#$targetPage = 'http://lcweb2.loc.gov/cgi-bin/query/S?ammem/gmd:@FILREQ(@field(SUBJ+@od1(Bird%27s-eye+view+prints--1860-1870+))+@FIELD(COLLID+citymap))';
#$targetPage = 'http://lcweb2.loc.gov/cgi-bin/query/d?gmd:20:./temp/~ammem_1fen:';
#getGalPage($targetPage);
#detectPageType($mysPage);
## Disabled resume-from-progress-file logic:
# if (open PROGRESSFILE, "<$progressFile"){
# my @file = <PROGRESSFILE>;
# close PROGRESSFILE;
# $start = chomp($file[0]);
# print "Progress File says start at $start \n";
# } else {
# print "No Progress File found so starting at the beginning. \n";
# my $start = 1;
# }
#$place = 'http://lcweb2.loc.gov/pp/ahiihtml/ahiisubjindex1.html';
#getPlacePage($place);
# NOTE(review): the four assignments below are live code but their values
# are unused -- every call that consumed them is commented out.
$photo = 'http://lcweb2.loc.gov/cgi-bin/query/I?ils:1:./temp/~pp_GhEo::displayType=1:m856sd=cph:m856sf=3b28121:@@@';
#getImageFromImagePage($photo);
$dataPage = 'http://lcweb2.loc.gov/cgi-bin/query/S?pp/ils:@FILREQ(@field(SUBJ+@od1(Galatasaray+Mekteb-i+Sultanisi--Buildings--1880-1900+))+@FIELD(COLLID+ahii))';
#getDataPage($dataPage);
$galleryPage = 'http://lcweb2.loc.gov/cgi-bin/query/S?pp/ils:@FILREQ(@field(SUBJ+@od1(Garden+rooms--Turkey--Istanbul--1880-1900+))+@FIELD(COLLID+ahii))';
#detectPageType($galleryPage);
#detectPageType($dataPage);
$subjectsPage ='http://lcweb2.loc.gov/pp/ahiihtml/ahiiSubjects04.html';
#getPlacePage($subjectsPage);
# Entry point: walk the configured subject index.
getSubjectPage($SubjectIndex);
#getCollections();
#no longer being used
# getCollections()
#
# (Dead code -- the only caller is commented out.)  Walks the Prints &
# Photographs home page, follows every anchor, and for each linked page
# that mentions "Subject and format headings" extracts its Browse link and
# hands it to getSubjectPage().  All fetches shell out to curl.
sub getCollections{
    print "getCollection \n";
    my $collectionPageUrl = 'http://lcweb2.loc.gov/pp/pphome.html';
    $collectionPage = `curl --retry 30 -s \'$collectionPageUrl\'`;
    # Every anchor on the home page is a candidate collection link.
    my @anchors = ($collectionPage =~ /\<A HREF=\".*?\"/gs);
    foreach my $anchor (@anchors) {
        (my $href = $anchor) =~ s/\<A HREF=\"(.*?)\"/$1/;
        # NOTE(review): the trailing "\n" ends up inside the quoted curl
        # argument -- preserved as-is from the original.
        my $subUrl = $baseURL . "/pp/" . $href . "\n";
        $subjectSubpage = `curl --retry 30 -s \'$subUrl\'`;
        # Wrong kind of page entirely -- skip it.
        next unless $subjectSubpage =~ /Subject.and.format.headings/s;
        # Reduce the whole page to just the Browse link target.
        $subjectSubpage =~ s/.*Browse.*?\<A HREF=\"(.*?)\"\>.*/$1/s ;
        # If the substitution failed we still hold a whole page -- skip.
        next if length($subjectSubpage) > 150;
        my $browseUrl = $baseURL . "/pp/" . $subjectSubpage;
        print "!!!!!!!!! " . $browseUrl . " !!! \n\n\n";
        getSubjectPage($browseUrl);
    }
}
# getSubjectPage(subjectPageUrl)
#
# Fetch one subject-index page and chase every "From ... <a href=...>"
# link on it.  Each relative link is resolved against the index page's
# directory and handed to getPlacePage().
sub getSubjectPage{
    print "getSubjectPage \n";
    my $pageUrl = shift;
    $localSubjectPage = `curl --retry 30 -s \'$pageUrl\'`;
    # Directory portion of the index URL; relative links hang off it.
    my $dir = $pageUrl;
    $dir =~ s/(.*)\/.*?$/$1/s ;
    my @fromLinks = ($localSubjectPage =~ /From.*?\<a href=\".*?\"\>/gs);
    foreach my $chunk (@fromLinks) {
        # Strip the surrounding markup, keeping only the href value.
        (my $rel = $chunk) =~ s/From.*\<a href=\"(.*?)\"\>/$1/;
        print $dir ."/" .$rel. "000 \n";
        getPlacePage( $dir ."/" .$rel);
    }
}
# getPlacePage(placePageUrl)
#
# Download a place/subject listing page and run every cgi-bin query link
# found on it through detectPageType().
sub getPlacePage{
    print "getPlacePage \n";
    my $listUrl = shift;
    $localPlacePage = `curl --retry 30 -s \'$listUrl\'`;
    # Only anchors pointing into the query CGI are records/galleries.
    my @anchors = ($localPlacePage =~ /\<A HREF=\".cgi-bin.query.*?\"\>/gs);
    foreach my $anchor (@anchors) {
        (my $href = $anchor) =~ s/\<A HREF=\"(.*?)\"\>/$1/;
        print $href;
        detectPageType($baseURL . $href);
    }
}
# detectPageType(mysteryPageUrl)
#
# Fetch an unknown URL and dispatch on its <title>:
#   "Search Results"   -> gallery listing -> getGalPage()
#   all-digits title   -> single record   -> getDataPage()
# Anything else is reported and skipped.
sub detectPageType{
    print "detectPageType \n";
    my $unknownUrl = shift;
    $mysteryPage = `curl --retry 30 -s \'$unknownUrl\'`;
    if ($mysteryPage =~ /\<title\>Search Results\<.title\>/s) {
        getGalPage($unknownUrl);
        return;
    }
    if ($mysteryPage =~ /\<title\>\d+?\<.title\>/s) {
        getDataPage($unknownUrl);
        return;
    }
    print "UNKNOWN page type for $unknownUrl \n";
}
# getGalPage(targetPageUrl)
#
# Fetch one gallery (search-results) page, run getDataPage() on every
# cgi-bin query link found on it, then follow the "NEXT PAGE" link (if
# any) by recursing on itself.
sub getGalPage{
my $targetPageUrl = shift;
print "getGalPage $targetPageUrl \n";
# Whole page kept in a package global; reused below for pagination.
$galleryPage = `curl --retry 30 -s \'$targetPageUrl\'`;
$_ = $galleryPage;
# All links into the query CGI are individual records (or sub-galleries).
@galLinks = /\<A HREF=\".cgi-bin.query.*?\"\>/gs;
foreach(@galLinks) {
$dataPageUrl = $_;
# Strip the anchor markup, keeping only the href value.
$dataPageUrl =~ s/\<A HREF=\"(.*?)\"\>/\1/;
#print( "glinks: ". $baseURL . $dataPageUrl ."\n");
getDataPage($baseURL . $dataPageUrl);
}
# Pagination: reduce the page to the NEXT PAGE href, if present.
$nextPage = $galleryPage;
$nextPage =~ s/.*a href\=\"(.*?)\"\>NEXT PAGE.*/\1/s;
# Length heuristic: if the substitution failed, $nextPage still holds the
# whole page (very long); a real match is a short relative URL.
if (length($nextPage) < 90 and length($nextPage) > 1){
getGalPage($baseURL . $nextPage);
}
}
# getDataPage(dataPageUrl)
#
# Fetch a single record ("data") page, scrape its catalogue fields into
# package globals, sanitise them for one-line CSV output, build the name
# the image will be saved under, then hand the record's image-page link to
# getImageFromImagePage() -- which is what actually appends the CSV row.
#
# NOTE: $title .. $digitalId, $imgLink and $imgName are deliberately
# package globals; getImageFromImagePage() reads them later.
sub getDataPage{
print "getDataPage \n";
my $subTarget = shift;
# Percent-encode single quotes so the URL cannot break out of the shell
# quoting used around the curl argument below.
$subTarget =~ s/\'/%27/g ;
print "--------runing curl --retry 30 -s \'$subTarget\' \n\n";
$dataPage = `curl --retry 30 -s \'$subTarget\'`;
#make dups to run regex on
$imgLink = $dataPage;
$title = $dataPage;
$created = $dataPage;
$notes = $dataPage;
$subjects = $dataPage;
$names = $dataPage;
$medium = $dataPage;
$callNumber = $dataPage;
$repository = $dataPage;
$digitalId = $dataPage;
#find the correct section using regex; when a pattern does not match the
#variable keeps the whole page, which the length checks below blank out
$imgLink =~ s/.*\<A HREF=\"(.*?)\"\>\W?\<IMG SRC.*?\>.*/$1/s;
$title =~ s/.*TITLE:(.*?)CALL.*/$1/s;
$created =~ s/.*CREATED.PUBLISHED:\<.[Bb]\>(.*?)\<[Bb]\>.*/$1/s;
$notes =~ s/.*NOTES:\<.[Bb]\>(.*?)\<[Bb]\>.*/$1/s;
$subjects =~ s/.*SUBJECTS:\<.[Bb]\>(.*?)\<[Bb]\>.*/$1/s;
$names =~ s/.*RELATED.NAMES:\<.[Bb]\>(.*?)\<[Bb]\>.*/$1/s;
$medium =~ s/.*MEDIUM:\<.[Bb]\>(.*?)\<[Bb]\>.*/$1/s;
$callNumber =~ s/.*CALL.NUMBER:\<.[Bb]\>(.*?)\<[Bb]\>.*/$1/s;
$repository =~ s/.*REPOSITORY:\<.[Bb]\>(.*?)\<[Bb]\>.*/$1/s;
$digitalId =~ s/.*DIGITAL.ID:(.*?)\<P\>.*/$1/s;
#strip html tags, entities, newlines and CSV-hostile characters
$title =~ s/\<[b-zB-Z\/]{1,5}\>|\&nbsp\;|\n|,|\'//g;
$created =~ s/\<.{1,5}\>|\n|,|\'|\"//g;
$notes =~ s/\<.{1,5}\>|\n|,|\'|\"//g;
$subjects =~ s/\<[b-zB-Z\/]{1,5}\>|\n|,|\'//g;
$names =~ s/\<.{1,5}\>|\n|,|\'//g;
$medium =~ s/\<.{1,5}\>|\n|,|\'//g;
$callNumber =~ s/\<.{1,5}\>|\n|,|\'//g;
$repository =~ s/\<.{1,5}\>|\n|,|\'//g;
$digitalId =~ s/\<.{1,5}\>|\n|,|\'//g;
#make links stop being relative
$title =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
$created =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
$notes =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
$subjects =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
$names =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
$medium =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
$callNumber =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
$repository =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
$digitalId =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
#check to make sure that we don't paste a whole page into a field
# BUG FIX: this previously read  length($imgLink) > {500}  -- comparing the
# length against the numified address of an anonymous hash ref (a huge
# number), so the guard could never fire.
if (length($imgLink) > 500){$imgLink = ''};
if (length($title) > 1500){$title = ''};
if (length($created) > 1500){$created = ''};
if (length($notes ) > 1500){$notes = ''};
if (length($subjects ) >1500){$subjects = ''};
if (length($names ) > 1500){$names = ''};
if (length($medium ) >1500){$medium = ''};
if (length($callNumber ) > 1500){$callNumber = ''};
if (length($repository ) > 1500){$repository = ''};
if (length($digitalId ) > 1500){$digitalId = ''};
#specific request for the first 4-digit year to be appended to file names
$firstDate = $created;
$firstDate =~ s/.*?(\d\d\d\d).*/$1/ ;
# If no year matched, $firstDate still holds the whole field -- blank it.
if ($firstDate =~ /.{5}/){$firstDate = ''};
#$uniqueMarker = $$; #use the pid if we are forking
$uniqueMarker = int(rand(9999)); #random marker keeps concurrent file names unique
#set the name the image will be saved under:
#marker + first 23 word-characters of the title + "_" + first date
$imgName = $title;
$imgName =~ s/\<.*?\>|\W//g;
$imgName =~ s/(.{0,23}).*/$1/;
$imgName = $uniqueMarker. $imgName . "_" . $firstDate;
##debug only
# print "\n title". $title;
# print "\n created" . $created;
# print "\n notes" . $notes;
# print "\n subjects" . $subjects;
# print "\n names" . $names;
# print "\n medium" . $medium;
# print "\n callNumber" . $callNumber;
# print "\n repository" . $repository;
# print "\n digitalId" . $digitalId;
# print "\n writable img name" . $imgName.$imgType;
# print "\n";
# A real image href is short; anything longer means no scrape succeeded.
if (!$imgLink || length($imgLink) > 100 ){
print "No downloadable image\n";
# die( "$$ Appears to be a Gallery Page\n");
} else {
#fetch the image (this also writes the CSV row for this record)
getImageFromImagePage($baseURL . $imgLink);
}
} # close sub getDataPage
# getImageFromImagePage(imagePageUrl)
#
# Visit the record's image page, extract the href of its "Retrieve ..."
# link, append one CSV row (from the globals scraped by getDataPage()) to
# $outputFile -- serialised across processes via a crude lock file -- log
# the curl command to $curlStatements, then run curl to download the image
# into $imagesDir.
sub getImageFromImagePage{
print "getImageFromImagePage \n";
my $imgPageUrl = shift;
# Percent-encode single quotes so the URL survives the shell quoting below.
$imgPageUrl =~ s/\'/%27/g ;
$imgPage = `curl --retry 30 -s \'$imgPageUrl\'`;
# Reduce the whole page to just the image file URL.
$imgPage =~ s/.*\<a href=\"(.*?)\"\>Retrieve.*/$1/s ;
if (length($imgPage) > 200 ){
# Substitution failed -- $imgPage still holds a whole page.  Original
# behavior kept: we only warn, and still write the CSV row below.
print "can't find image Link for $imgName \n";
}
# File extension (last ".xxx"); global because the CSV row uses it too.
$imgType = $imgPage;
$imgType =~ s/.*(\....)$/$1/;
# Crude cross-process lock around the shared CSV file: spin while another
# process holds the lock file, then create it ourselves.
$lockfile="lock_the_file.loc";
while (-e $lockfile) {
print "sleep for $$ \n";
sleep 2;
}
open (LOCK,">$lockfile") || die ("Cannot open lock file!\n");
close (LOCK);
open(DAT,">>$outputFile") || die("Cannot Open $outputFile ");
print DAT "\n". $title;
print DAT ',' . $created;
print DAT ',' . $notes;
print DAT ',' . $subjects;
print DAT ',' . $names;
print DAT ',' . $medium;
print DAT ',' . $callNumber;
print DAT ',' . $repository;
print DAT ',' . $digitalId;
print DAT ',' . $imgName.$imgType;
close(DAT);
my $myCurl = "curl --retry 30 " . $imgPage." -o ".$imagesDir.$imgName.$imgType . "\n";
# BUG FIX: the die() message previously named $outputFile, not the file
# actually being opened.
open(CURL,">>$curlStatements") || die("Cannot Open $curlStatements ");
print CURL $myCurl;
# BUG FIX: CURL was never closed, so the buffered log line could be lost
# if the long-running download below was interrupted.
close(CURL);
unlink($lockfile);
print "\nRUNNING " . $myCurl ;
# SECURITY NOTE(review): $imgPage and $imgName are interpolated into a
# shell command unquoted -- a hostile page could inject shell syntax.
# Consider system('curl', '--retry', '30', $imgPage, '-o', $dest) instead.
system($myCurl);
} # close sub getImageFromImagePage