Browse Source

nostalgia, of a sort

Brennen Bearnes 7 years ago
commit
71567b4a0d
1 changed files with 379 additions and 0 deletions
  1. 379
    0
      photoFetch.pl

+ 379
- 0
photoFetch.pl View File

@@ -0,0 +1,379 @@
1
+=pod
2
+
3
+=head1 CONTEXT
4
+
5
+This little gem came to me by way of a craigslist job. I wound up making about a
6
+hundred bucks for something like 50 hours of work on this project before I
7
+cut the guy off. At which point he threatened legal action and promised me
8
+I'd never work in Colorado again.
9
+
10
+This sort of thing is why I don't take random craigslist jobs any more. It is
11
+also why people are scared of old Perl.
12
+
13
+-- BPB
14
+
15
+=cut
16
+
17
+
18
+## $SubjectIndex ='http://lcweb2.loc.gov/pp/ahiihtml/ahiisubjindex1.html';
19
+
20
+$SubjectIndex = 'http://lcweb2.loc.gov/pp/pgzhtml/pgzsubjindex1.html';
21
+
22
+## set this line to the archives you want to download.
23
+
24
+
25
+my $curlStatements = "photoCurlStatements.txt";
26
+my $problemPageList = "photoProblemPages.txt";
27
+my $imagesDir = 'Photos/';  #name it whatever you like, just make sure 
28
+my $outputFile = "photoData.csv";  #name it whatever you like 
29
+my $progressFile = "photoProgress.txt"; #this tracks how far the script has gone (very coursely)
30
+
31
+my $baseURL = 'http://lcweb2.loc.gov';   # This really shouldn't be changed, script is not very re-usable
32
+my $lastPlacePage = 28; #the highest numbered place page
33
+
34
+#################################################
35
+# Programmers guide:
36
+# This is a non-generalised script for downloading
37
+# map files and information that can be found at
38
+# http://lcweb2.loc.gov/ammem/gmdhtml/gmdgeogindex1.html
39
+
40
+# Major Subsections and functions
41
+# getPlacePage(placePageUrl)
42
+#   This function fetches the Place Page list and then
43
+#   goes through it one item at a time passing the urls
44
+#   to detectPageType
45
+
46
+# detectPageType(unknownPageUrl)
47
+#   Takes a page URL, and figures out if it is a gallery
48
+#   page, or a data page. If it is a gallery page it also
49
+#   attempts to also find if there are more in the series.
50
+#   it then runs getGalPage for each one in the series, or if
51
+#   it is a data page, it forks of a getDataPage for it.
52
+# --------------------
53
+
54
+# getGalPage(targetPageUrl)
55
+#   This function takes in a gallery page and forks off
56
+#   a new Process for each link on it. The process then
57
+#   runs getDataPage
58
+
59
+# getDataPage(dataPageUrl)
60
+#   This script attempts to parse the page it has reached
61
+#   clean up the data, and then put it into the CSV file
62
+#   then it takes the image link it has found and passes
63
+#   it to getImageFromImagePage
64
+
65
+# getImageFromImagePage(imagePageUrl)
66
+#   This visits the Image Page Url, and then does the
67
+#   parsing nessisary to actually find the image files
68
+#   Url. It then uses exec to launch a copy of curl to
69
+#   Download the the (often large) file. Because it
70
+#   uses exec the process dies here. 
71
+#################################################
72
+
73
+
74
+#$targetPage = 'http://lcweb2.loc.gov/cgi-bin/query/S?ammem/gmd:@FILREQ(@field(SUBJ+@od1(Bird%27s-eye+view+prints--1860-1870+))+@FIELD(COLLID+citymap))';
75
+
76
+#$targetPage = 'http://lcweb2.loc.gov/cgi-bin/query/d?gmd:20:./temp/~ammem_1fen:';
77
+
78
+#getGalPage($targetPage);
79
+
80
+#detectPageType($mysPage);
81
+
82
+# if (open PROGRESSFILE, "<$progressFile"){
83
+#     my @file = <PROGRESSFILE>;
84
+#     close PROGRESSFILE;
85
+#     $start = chomp($file[0]);
86
+#     print "Progress File says start at $start \n";
87
+# } else {
88
+#     print "No Progress File found so starting at the begining. \n";
89
+#     my $start = 1;
90
+# }
91
+
92
+#$place = 'http://lcweb2.loc.gov/pp/ahiihtml/ahiisubjindex1.html';
93
+#getPlacePage($place);
94
+
95
+$photo = 'http://lcweb2.loc.gov/cgi-bin/query/I?ils:1:./temp/~pp_GhEo::displayType=1:m856sd=cph:m856sf=3b28121:@@@';
96
+#getImageFromImagePage($photo);
97
+
98
+$dataPage = 'http://lcweb2.loc.gov/cgi-bin/query/S?pp/ils:@FILREQ(@field(SUBJ+@od1(Galatasaray+Mekteb-i+Sultanisi--Buildings--1880-1900+))+@FIELD(COLLID+ahii))';
99
+#getDataPage($dataPage);
100
+
101
+$galleryPage = 'http://lcweb2.loc.gov/cgi-bin/query/S?pp/ils:@FILREQ(@field(SUBJ+@od1(Garden+rooms--Turkey--Istanbul--1880-1900+))+@FIELD(COLLID+ahii))';
102
+#detectPageType($galleryPage);
103
+#detectPageType($dataPage);
104
+
105
+$subjectsPage ='http://lcweb2.loc.gov/pp/ahiihtml/ahiiSubjects04.html';
106
+#getPlacePage($subjectsPage);
107
+
108
+
109
+getSubjectPage($SubjectIndex);
110
+
111
+#getCollections();
112
+
113
+
114
+#no longer being used
115
+sub getCollections{
116
+    print "getCollection \n";
117
+    my $collectionPageUrl = 'http://lcweb2.loc.gov/pp/pphome.html';
118
+    $collectionPage = `curl --retry 30  -s \'$collectionPageUrl\'`;
119
+    $_ = $collectionPage;
120
+    @collectionLinks = /\<A HREF=\".*?\"/gs;
121
+
122
+    foreach(@collectionLinks) {
123
+	$subjectPageUrl = $_;
124
+	$subjectPageUrl =~ s/\<A HREF=\"(.*?)\"/\1/;
125
+	#print $baseURL . "/pp/" . $subjectPageUrl. " ****\n";
126
+
127
+	$subSubUrl = $baseURL . "/pp/" . $subjectPageUrl. "\n";
128
+	$subjectSubpage = `curl --retry 30 -s  \'$subSubUrl\'`;
129
+	
130
+	if($subjectSubpage =~ /Subject.and.format.headings/s){
131
+    	  $subjectSubpage =~ s/.*Browse.*?\<A HREF=\"(.*?)\"\>.*/\1/s ;
132
+	  if (length($subjectSubpage) > 150){
133
+	    #print length($subjectSubpage) . " wrong kind of page2\n";
134
+	  }else {
135
+	      $getpage = $baseURL . "/pp/" . $subjectSubpage;
136
+	      print "!!!!!!!!!  " .$getpage . " !!! \n\n\n";	 
137
+	      getSubjectPage($getpage);
138
+	  }	
139
+      } else {     
140
+	  #print length($subjectSubpage) . " wrong kind of page\n";
141
+      }
142
+    }
143
+
144
+}
145
+
146
+
147
+
148
+
149
+sub getSubjectPage{
150
+    print "getSubjectPage \n";
151
+    my $localSubjectPageUrl = shift;
152
+    $localSubjectPage = `curl --retry 30  -s \'$localSubjectPageUrl\'`;
153
+    $_ = $localSubjectPage;
154
+    @subjectLinks = /From.*?\<a href=\".*?\"\>/gs;
155
+
156
+    $base = $localSubjectPageUrl;
157
+    $base =~ s/(.*)\/.*?$/\1/s ;
158
+
159
+    foreach(@subjectLinks) {
160
+	$dataPageUrl = $_;
161
+	$dataPageUrl =~ s/From.*\<a href=\"(.*?)\"\>/\1/;
162
+	print $base ."/" .$dataPageUrl. "000 \n";
163
+	getPlacePage( $base ."/" .$dataPageUrl);
164
+    }
165
+}
166
+
167
+
168
+sub getPlacePage{
169
+    print "getPlacePage \n";
170
+    my $localPlacePageUrl = shift;
171
+    $localPlacePage = `curl --retry 30  -s \'$localPlacePageUrl\'`;
172
+    $_ = $localPlacePage;
173
+    @placeLinks = /\<A HREF=\".cgi-bin.query.*?\"\>/gs;
174
+
175
+    foreach(@placeLinks) {
176
+	$dataPageUrl = $_;
177
+	$dataPageUrl =~ s/\<A HREF=\"(.*?)\"\>/\1/;
178
+	print $dataPageUrl;
179
+	detectPageType($baseURL . $dataPageUrl);
180
+    }
181
+
182
+}
183
+
184
+sub detectPageType{
185
+    print "detectPageType \n";
186
+  my $mysteryPageUrl = shift; 
187
+
188
+  #print "FINDING page type for $mysteryPageUrl\n";
189
+  $mysteryPage = `curl --retry 30  -s \'$mysteryPageUrl\'`;
190
+
191
+  if ($mysteryPage =~ /\<title\>Search Results\<.title\>/s){
192
+      getGalPage($mysteryPageUrl);
193
+  } elsif ($mysteryPage =~ /\<title\>\d+?\<.title\>/s){
194
+      getDataPage($mysteryPageUrl);
195
+  } else {
196
+      print "UNKNOWN page type for $mysteryPageUrl \n";
197
+  }
198
+
199
+}
200
+
201
+sub getGalPage{
202
+  my $targetPageUrl = shift; 
203
+
204
+  print "getGalPage $targetPageUrl \n";
205
+  $galleryPage = `curl --retry 30 -s \'$targetPageUrl\'`;
206
+  
207
+  $_ = $galleryPage;
208
+  @galLinks = /\<A HREF=\".cgi-bin.query.*?\"\>/gs;
209
+
210
+  foreach(@galLinks) {
211
+    $dataPageUrl = $_;
212
+    $dataPageUrl =~ s/\<A HREF=\"(.*?)\"\>/\1/;
213
+    #print( "glinks:  ". $baseURL . $dataPageUrl ."\n");
214
+    getDataPage($baseURL . $dataPageUrl);
215
+  }
216
+
217
+  $nextPage = $galleryPage;
218
+  $nextPage =~ s/.*a href\=\"(.*?)\"\>NEXT PAGE.*/\1/s;
219
+  if (length($nextPage) < 90 and length($nextPage) > 1){
220
+      getGalPage($baseURL . $nextPage);
221
+  }
222
+
223
+
224
+}
225
+
226
+sub getDataPage{
227
+    print "getDataPage \n";
228
+  my $subTarget = shift; 
229
+  $subTarget =~ s/\'/%27/g ;
230
+  print "--------runing curl --retry 30 -s \'$subTarget\' \n\n";
231
+  $dataPage = `curl --retry 30  -s \'$subTarget\'`;
232
+  
233
+  #make dups to run regex on
234
+  $imgLink = $dataPage;
235
+  $title = $dataPage;
236
+  $created = $dataPage;
237
+  $notes = $dataPage;
238
+  $subjects = $dataPage;
239
+  $names = $dataPage;
240
+  $medium = $dataPage;
241
+  $callNumber = $dataPage;
242
+  $repository = $dataPage;
243
+  $digitalId = $dataPage;
244
+  
245
+  #find the correct section using regex
246
+  $imgLink =~ s/.*\<A HREF=\"(.*?)\"\>\W?\<IMG SRC.*?\>.*/\1/s;
247
+
248
+  #print $imgLink ."\n";
249
+
250
+  $title =~ s/.*TITLE:(.*?)CALL.*/\1/s;
251
+  $created =~ s/.*CREATED.PUBLISHED:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
252
+  $notes =~ s/.*NOTES:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
253
+  $subjects =~ s/.*SUBJECTS:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
254
+  $names =~ s/.*RELATED.NAMES:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
255
+  $medium =~ s/.*MEDIUM:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
256
+  $callNumber =~ s/.*CALL.NUMBER:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
257
+  
258
+  $repository =~ s/.*REPOSITORY:\<.[Bb]\>(.*?)\<[Bb]\>.*/\1/s;
259
+  
260
+  $digitalId =~ s/.*DIGITAL.ID:(.*?)\<P\>.*/\1/s;
261
+  
262
+  #strip html
263
+  $title =~ s/\<[b-zB-Z\/]{1,5}\>|\&nbsp\;|\n|,|\'//g;
264
+  $created =~ s/\<.{1,5}\>|\n|,|\'|\"//g;
265
+  $notes =~ s/\<.{1,5}\>|\n|,|\'|\"//g;
266
+  $subjects =~ s/\<[b-zB-Z\/]{1,5}\>|\n|,|\'//g;
267
+  $names =~ s/\<.{1,5}\>|\n|,|\'//g;
268
+  $medium =~ s/\<.{1,5}\>|\n|,|\'//g;
269
+  $callNumber =~ s/\<.{1,5}\>|\n|,|\'//g;
270
+  $repository =~ s/\<.{1,5}\>|\n|,|\'//g;
271
+  $digitalId =~ s/\<.{1,5}\>|\n|,|\'//g;
272
+
273
+  #make links stop being relative
274
+  $title =~  s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
275
+  $created =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
276
+  $notes =~  s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
277
+  $subjects =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
278
+  $names =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
279
+  $medium =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
280
+  $callNumber =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
281
+  $repository =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
282
+  $digitalId =~ s/=\"\/cgi-bin/=\"$baseURL\/cgi-bin/g;
283
+
284
+  
285
+  #check to make sure that we don't paste a whole page into a field
286
+  if (length($imgLink) > {500}){$imgLink = ''};
287
+  if (length($title) > 1500){$title = ''};
288
+  if (length($created) > 1500){$created = ''};
289
+  if (length($notes ) > 1500){$notes = ''};
290
+  if (length($subjects ) >1500){$subjects = ''};
291
+  if (length($names ) > 1500){$names = ''};
292
+  if (length($medium ) >1500){$medium = ''};
293
+  if (length($callNumber ) > 1500){$callNumber = ''};
294
+  if (length($repository ) > 1500){$repository = ''};
295
+  if (length($digitalId ) > 1500){$digitalId = ''};
296
+  
297
+  #specific request for the first date to be appended to file names
298
+  $firstDate = $created;
299
+  $firstDate =~ s/.*?(\d\d\d\d).*/\1/ ;
300
+  if ($firstDate =~ /.{5}/){$firstDate = ''};
301
+
302
+  #$uniqueMarker = $$; #use the pid if we are forking
303
+  $uniqueMarker = int(rand(9999)); #use the pid if we are forking
304
+
305
+  #set the name the image will be saved under
306
+  $imgName = $title;
307
+  $imgName =~ s/\<.*?\>|\W//g;
308
+  $imgName =~ s/(.{0,23}).*/\1/;
309
+  $imgName = $uniqueMarker. $imgName . "_" . $firstDate;
310
+
311
+
312
+  ##debug only
313
+#     print  "\n title". $title;
314
+#     print  "\n created" . $created;
315
+#     print  "\n notes" . $notes;
316
+#     print  "\n subjects" . $subjects;
317
+#     print  "\n names" . $names;
318
+#     print  "\n medium" . $medium;
319
+#     print  "\n callNumber" . $callNumber;
320
+#     print  "\n repository" . $repository;
321
+#     print  "\n digitalId" . $digitalId;
322
+#     print  "\n writable img name" . $imgName.$imgType;
323
+#   print  "\n";
324
+
325
+
326
+
327
+  if (!$imgLink || length($imgLink) > 100 ){
328
+    print "No downloadable image\n";
329
+    #    die( "$$ Appears to be a Gallery Page\n");
330
+  } else {
331
+    #fetch the image
332
+    getImageFromImagePage($baseURL . $imgLink);
333
+  }
334
+} # close sub getDataPage
335
+
336
+sub getImageFromImagePage{
337
+    print "getImageFromImagePage \n";
338
+    my $imgPageUrl = shift;
339
+    $imgPageUrl =~ s/\'/%27/g ;
340
+    #print $imgPageUrl . "\n";
341
+    $imgPage = `curl --retry 30  -s \'$imgPageUrl\'`;
342
+    $imgPage =~ s/.*\<a href=\"(.*?)\"\>Retrieve.*/\1/s ; 
343
+    if (length($imgPage) > 200 ){
344
+	print "can't find image Link for $imgName \n";
345
+    }
346
+    $imgType = $imgPage;
347
+    $imgType =~ s/.*(\....)$/\1/;
348
+  # Lock and write out data to our output file;
349
+    $lockfile="lock_the_file.loc";
350
+    while (-e $lockfile) {
351
+      print "sleep for $$ \n";
352
+      sleep 2;
353
+    }
354
+    open (LOCK,">$lockfile") || die ("Cannot open lock file!\n");
355
+    close (LOCK);
356
+  
357
+    open(DAT,">>$outputFile") || die("Cannot Open $outputFile ");
358
+    print DAT "\n". $title;
359
+    print DAT ',' . $created;
360
+    print DAT ',' . $notes;
361
+    print DAT ',' . $subjects;
362
+    print DAT ',' . $names;
363
+    print DAT ',' . $medium;
364
+    print DAT ',' . $callNumber;
365
+    print DAT ',' . $repository;
366
+    print DAT ',' . $digitalId;
367
+    print DAT ',' . $imgName.$imgType;
368
+    close(DAT);
369
+  
370
+    my $myCurl = "curl --retry 30 " . $imgPage." -o ".$imagesDir.$imgName.$imgType . "\n";
371
+
372
+    open(CURL,">>$curlStatements") || die("Cannot Open $outputFile ");
373
+    print CURL $myCurl;
374
+    unlink($lockfile);
375
+
376
+    print "\nRUNNING " . $myCurl ;
377
+    system($myCurl);
378
+} # close sub getImageFromImagePage
379
+

Loading…
Cancel
Save