Just awful.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

379 lines
12 KiB

  1. =pod
  2. =head1 CONTEXT
  3. This little gem came to me by way of a craiglist job. I wound up making about a
  4. hundred bucks for something like 50 hours of work on this project before I
  5. cut the guy off. At which point he threatened legal action and promised me
  6. I'd never work in Colorado again.
  7. This sort of thing is why I don't take random craiglist jobs any more. It is
  8. also why people are scared of old Perl.
  9. -- BPB
  10. =cut
  11. ## $SubjectIndex ='http://lcweb2.loc.gov/pp/ahiihtml/ahiisubjindex1.html';
  12. $SubjectIndex = 'http://lcweb2.loc.gov/pp/pgzhtml/pgzsubjindex1.html';
  13. ## set this line to the archives you want to download.
  14. my $curlStatements = "photoCurlStatements.txt";
  15. my $problemPageList = "photoProblemPages.txt";
  16. my $imagesDir = 'Photos/'; #name it whatever you like, just make sure
  17. my $outputFile = "photoData.csv"; #name it whatever you like
  18. my $progressFile = "photoProgress.txt"; #this tracks how far the script has gone (very coursely)
  19. my $baseURL = 'http://lcweb2.loc.gov'; # This really shouldn't be changed, script is not very re-usable
  20. my $lastPlacePage = 28; #the highest numbered place page
  21. #################################################
  22. # Programmers guide:
  23. # This is a non-generalised script for downloading
  24. # map files and information that can be found at
  25. # http://lcweb2.loc.gov/ammem/gmdhtml/gmdgeogindex1.html
  26. # Major Subsections and functions
  27. # getPlacePage(placePageUrl)
  28. # This function fetches the Place Page list and then
  29. # goes through it one item at a time passing the urls
  30. # to detectPageType
  31. # detectPageType(unknownPageUrl)
  32. # Takes a page URL, and figures out if it is a gallery
  33. # page, or a data page. If it is a gallery page it also
  34. # attempts to also find if there are more in the series.
  35. # it then runs getGalPage for each one in the series, or if
  36. # it is a data page, it forks of a getDataPage for it.
  37. # --------------------
  38. # getGalPage(targetPageUrl)
  39. # This function takes in a gallery page and forks off
  40. # a new Process for each link on it. The process then
  41. # runs getDataPage
  42. # getDataPage(dataPageUrl)
  43. # This script attempts to parse the page it has reached
  44. # clean up the data, and then put it into the CSV file
  45. # then it takes the image link it has found and passes
  46. # it to getImageFromImagePage
  47. # getImageFromImagePage(imagePageUrl)
  48. # This visits the Image Page Url, and then does the
  49. # parsing nessisary to actually find the image files
  50. # Url. It then uses exec to launch a copy of curl to
  51. # Download the the (often large) file. Because it
  52. # uses exec the process dies here.
  53. #################################################
  54. #$targetPage = 'http://lcweb2.loc.gov/cgi-bin/query/S?ammem/gmd:@FILREQ(@field(SUBJ+@od1(Bird%27s-eye+view+prints--1860-1870+))+@FIELD(COLLID+citymap))';
  55. #$targetPage = 'http://lcweb2.loc.gov/cgi-bin/query/d?gmd:20:./temp/~ammem_1fen:';
  56. #getGalPage($targetPage);
  57. #detectPageType($mysPage);
  58. # if (open PROGRESSFILE, "<$progressFile"){
  59. # my @file = <PROGRESSFILE>;
  60. # close PROGRESSFILE;
  61. # $start = chomp($file[0]);
  62. # print "Progress File says start at $start \n";
  63. # } else {
  64. # print "No Progress File found so starting at the begining. \n";
  65. # my $start = 1;
  66. # }
  67. #$place = 'http://lcweb2.loc.gov/pp/ahiihtml/ahiisubjindex1.html';
  68. #getPlacePage($place);
  69. $photo = 'http://lcweb2.loc.gov/cgi-bin/query/I?ils:1:./temp/~pp_GhEo::displayType=1:m856sd=cph:m856sf=3b28121:@@@';
  70. #getImageFromImagePage($photo);
  71. $dataPage = 'http://lcweb2.loc.gov/cgi-bin/query/S?pp/ils:@FILREQ(@field(SUBJ+@od1(Galatasaray+Mekteb-i+Sultanisi--Buildings--1880-1900+))+@FIELD(COLLID+ahii))';
  72. #getDataPage($dataPage);
  73. $galleryPage = 'http://lcweb2.loc.gov/cgi-bin/query/S?pp/ils:@FILREQ(@field(SUBJ+@od1(Garden+rooms--Turkey--Istanbul--1880-1900+))+@FIELD(COLLID+ahii))';
  74. #detectPageType($galleryPage);
  75. #detectPageType($dataPage);
  76. $subjectsPage ='http://lcweb2.loc.gov/pp/ahiihtml/ahiiSubjects04.html';
  77. #getPlacePage($subjectsPage);
  78. getSubjectPage($SubjectIndex);
  79. #getCollections();
  80. #no longer being used
  81. sub getCollections{
  82. print "getCollection \n";
  83. my $collectionPageUrl = 'http://lcweb2.loc.gov/pp/pphome.html';
  84. $collectionPage = `curl --retry 30 -s \'$collectionPageUrl\'`;
  85. $_ = $collectionPage;
  86. @collectionLinks = /\<A HREF=\".*?\"/gs;
  87. foreach(@collectionLinks) {
  88. $subjectPageUrl = $_;
  89. $subjectPageUrl =~ s/\<A HREF=\"(.*?)\"/\1/;
  90. #print $baseURL . "/pp/" . $subjectPageUrl. " ****\n";
  91. $subSubUrl = $baseURL . "/pp/" . $subjectPageUrl. "\n";
  92. $subjectSubpage = `curl --retry 30 -s \'$subSubUrl\'`;
  93. if($subjectSubpage =~ /Subject.and.format.headings/s){
  94. $subjectSubpage =~ s/.*Browse.*?\<A HREF=\"(.*?)\"\>.*/\1/s ;
  95. if (length($subjectSubpage) > 150){
  96. #print length($subjectSubpage) . " wrong kind of page2\n";
  97. }else {
  98. $getpage = $baseURL . "/pp/" . $subjectSubpage;
  99. print "!!!!!!!!! " .$getpage . " !!! \n\n\n";
  100. getSubjectPage($getpage);
  101. }
  102. } else {
  103. #print length($subjectSubpage) . " wrong kind of page\n";
  104. }
  105. }
  106. }
  107. sub getSubjectPage{
  108. print "getSubjectPage \n";
  109. my $localSubjectPageUrl = shift;
  110. $localSubjectPage = `curl --retry 30 -s \'$localSubjectPageUrl\'`;
  111. $_ = $localSubjectPage;
  112. @subjectLinks = /From.*?\<a href=\".*?\"\>/gs;
  113. $base = $localSubjectPageUrl;
  114. $base =~ s/(.*)\/.*?$/\1/s ;
  115. foreach(@subjectLinks) {
  116. $dataPageUrl = $_;
  117. $dataPageUrl =~ s/From.*\<a href=\"(.*?)\"\>/\1/;
  118. print $base ."/" .$dataPageUrl. "000 \n";
  119. getPlacePage( $base ."/" .$dataPageUrl);
  120. }
  121. }
  122. sub getPlacePage{
  123. print "getPlacePage \n";
  124. my $localPlacePageUrl = shift;
  125. $localPlacePage = `curl --retry 30 -s \'$localPlacePageUrl\'`;
  126. $_ = $localPlacePage;
  127. @placeLinks = /\<A HREF=\".cgi-bin.query.*?\"\>/gs;
  128. foreach(@placeLinks) {
  129. $dataPageUrl = $_;
  130. $dataPageUrl =~ s/\<A HREF=\"(.*?)\"\>/\1/;
  131. print $dataPageUrl;
  132. detectPageType($baseURL . $dataPageUrl);
  133. }
  134. }
  135. sub detectPageType{
  136. print "detectPageType \n";
  137. my $mysteryPageUrl = shift;
  138. #print "FINDING page type for $mysteryPageUrl\n";
  139. $mysteryPage = `curl --retry 30 -s \'$mysteryPageUrl\'`;
  140. if ($mysteryPage =~ /\<title\>Search Results\<.title\>/s){
  141. getGalPage($mysteryPageUrl);
  142. } elsif ($mysteryPage =~ /\<title\>\d+?\<.title\>/s){
  143. getDataPage($mysteryPageUrl);
  144. } else {
  145. print "UNKNOWN page type for $mysteryPageUrl \n";
  146. }
  147. }
  148. sub getGalPage{
  149. my $targetPageUrl = shift;
  150. print "getGalPage $targetPageUrl \n";
  151. $galleryPage = `curl --retry 30 -s \'$targetPageUrl\'`;
  152. $_ = $galleryPage;
  153. @galLinks = /\<A HREF=\".cgi-bin.query.*?\"\>/gs;
  154. foreach(@galLinks) {
  155. $dataPageUrl = $_;
  156. $dataPageUrl =~ s/\<A HREF=\"(.*?)\"\>/\1/;
  157. #print( "glinks: ". $baseURL . $dataPageUrl ."\n");
  158. getDataPage($baseURL . $dataPageUrl);
  159. }
  160. $nextPage = $galleryPage;
  161. $nextPage =~ s/.*a href\=\"(.*?)\"\>NEXT PAGE.*/\1/s;
  162. if (length($nextPage) < 90 and length($nextPage) > 1){
  163. getGalPage($baseURL . $nextPage);
  164. }
  165. }
  166. sub getDataPage{
  167. print "getDataPage \n";
  168. my $subTarget = shift;
  169. $subTarget =~ s/\'/%27/g ;
  170. print "--------runing curl --retry 30 -s \'$subTarget\' \n\n";
  171. $dataPage = `curl --retry 30 -s \'$subTarget\'`;
  172. #make dups to run regex on
  173. $imgLink = $dataPage;
  174. $title = $dataPage;
  175. $created = $dataPage;
  176. $notes = $dataPage;
  177. $subjects = $dataPage;
  178. $names = $dataPage;
  179. $medium = $dataPage;
  180. $callNumber = $dataPage;
  181. $repository = $dataPage;
  182. $digitalId = $dataPage;
  183. #find the correct section using regex
  184. $imgLink =~ s/.*\<A HREF=\"(.*?)\"\>\W?\<IMG SRC.*?\>.*/\1/s;
  185. #print $imgLink ."\n";
  186. $title =~ s/.*TITLE:(.*?)CALL.*/\1/s;
  187. $created =~ s/.*CREATED.PUBLISHED:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
  188. $notes =~ s/.*NOTES:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
  189. $subjects =~ s/.*SUBJECTS:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
  190. $names =~ s/.*RELATED.NAMES:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
  191. $medium =~ s/.*MEDIUM:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
  192. $callNumber =~ s/.*CALL.NUMBER:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
  193. $repository =~ s/.*REPOSITORY:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
  194. $digitalId =~ s/.*DIGITAL.ID:(.*?)\<P\>.*/\1/s;
  195. #strip html
  196. $title =~ s/\<[b-zB-Z\/]{1,5}\>|\&nbsp\;|\n|,|\'//g;
  197. $created =~ s/\<.{1,5}\>|\n|,|\'|\"//g;
  198. $notes =~ s/\<.{1,5}\>|\n|,|\'|\"//g;
  199. $subjects =~ s/\<[b-zB-Z\/]{1,5}\>|\n|,|\'//g;
  200. $names =~ s/\<.{1,5}\>|\n|,|\'//g;
  201. $medium =~ s/\<.{1,5}\>|\n|,|\'//g;
  202. $callNumber =~ s/\<.{1,5}\>|\n|,|\'//g;
  203. $repository =~ s/\<.{1,5}\>|\n|,|\'//g;
  204. $digitalId =~ s/\<.{1,5}\>|\n|,|\'//g;
  205. #make links stop being relative
  206. $title =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
  207. $created =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
  208. $notes =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
  209. $subjects =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
  210. $names =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
  211. $medium =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
  212. $callNumber =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
  213. $repository =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
  214. $digitalId =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
  215. #check to make sure that we don't paste a whole page into a field
  216. if (length($imgLink) > {500}){$imgLink = ''};
  217. if (length($title) > 1500){$title = ''};
  218. if (length($created) > 1500){$created = ''};
  219. if (length($notes ) > 1500){$notes = ''};
  220. if (length($subjects ) >1500){$subjects = ''};
  221. if (length($names ) > 1500){$names = ''};
  222. if (length($medium ) >1500){$medium = ''};
  223. if (length($callNumber ) > 1500){$callNumber = ''};
  224. if (length($repository ) > 1500){$repository = ''};
  225. if (length($digitalId ) > 1500){$digitalId = ''};
  226. #specific request for the first date to be appended to file names
  227. $firstDate = $created;
  228. $firstDate =~ s/.*?(\d\d\d\d).*/\1/ ;
  229. if ($firstDate =~ /.{5}/){$firstDate = ''};
  230. #$uniqueMarker = $$; #use the pid if we are forking
  231. $uniqueMarker = int(rand(9999)); #use the pid if we are forking
  232. #set the name the image will be saved under
  233. $imgName = $title;
  234. $imgName =~ s/\<.*?\>|\W//g;
  235. $imgName =~ s/(.{0,23}).*/\1/;
  236. $imgName = $uniqueMarker. $imgName . "_" . $firstDate;
  237. ##debug only
  238. # print "\n title". $title;
  239. # print "\n created" . $created;
  240. # print "\n notes" . $notes;
  241. # print "\n subjects" . $subjects;
  242. # print "\n names" . $names;
  243. # print "\n medium" . $medium;
  244. # print "\n callNumber" . $callNumber;
  245. # print "\n repository" . $repository;
  246. # print "\n digitalId" . $digitalId;
  247. # print "\n writable img name" . $imgName.$imgType;
  248. # print "\n";
  249. if (!$imgLink || length($imgLink) > 100 ){
  250. print "No downloadable image\n";
  251. # die( "$$ Appears to be a Gallery Page\n");
  252. } else {
  253. #fetch the image
  254. getImageFromImagePage($baseURL . $imgLink);
  255. }
  256. } # close sub getDataPage
  257. sub getImageFromImagePage{
  258. print "getImageFromImagePage \n";
  259. my $imgPageUrl = shift;
  260. $imgPageUrl =~ s/\'/%27/g ;
  261. #print $imgPageUrl . "\n";
  262. $imgPage = `curl --retry 30 -s \'$imgPageUrl\'`;
  263. $imgPage =~ s/.*\<a href=\"(.*?)\"\>Retrieve.*/\1/s ;
  264. if (length($imgPage) > 200 ){
  265. print "can't find image Link for $imgName \n";
  266. }
  267. $imgType = $imgPage;
  268. $imgType =~ s/.*(\....)$/\1/;
  269. # Lock and write out data to our output file;
  270. $lockfile="lock_the_file.loc";
  271. while (-e $lockfile) {
  272. print "sleep for $$ \n";
  273. sleep 2;
  274. }
  275. open (LOCK,">$lockfile") || die ("Cannot open lock file!\n");
  276. close (LOCK);
  277. open(DAT,">>$outputFile") || die("Cannot Open $outputFile ");
  278. print DAT "\n". $title;
  279. print DAT ',' . $created;
  280. print DAT ',' . $notes;
  281. print DAT ',' . $subjects;
  282. print DAT ',' . $names;
  283. print DAT ',' . $medium;
  284. print DAT ',' . $callNumber;
  285. print DAT ',' . $repository;
  286. print DAT ',' . $digitalId;
  287. print DAT ',' . $imgName.$imgType;
  288. close(DAT);
  289. my $myCurl = "curl --retry 30 " . $imgPage." -o ".$imagesDir.$imgName.$imgType . "\n";
  290. open(CURL,">>$curlStatements") || die("Cannot Open $outputFile ");
  291. print CURL $myCurl;
  292. unlink($lockfile);
  293. print "\nRUNNING " . $myCurl ;
  294. system($myCurl);
  295. } # close sub getImageFromImagePage