Some hacky Perl to find twin pairs in census transcriptions
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

twins1900.pl 3.7KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169
  1. #!/usr/bin/perl
  2. use strict;
  3. use warnings;
  4. no warnings 'uninitialized';
  5. # 1900
  6. # Log records as they're read?
  7. my $log_reads = 0;
  8. my $pattern = '^[0-9]';
  9. # List of all fields:
  10. my @fields = qw(null dwelling fm lname fname relation race sex birthmonth
  11. birthyear age marital_status years_married children children_living
  12. birthplace father_birthplace mother_birthplace year_immi years_us
  13. naturalized occupation months_unemployed months_school read write eng
  14. o_r_home f_m_mort farm farm_sch line);
  15. # We'll key on these when looking for twins:
  16. my @match_fields = qw(lname birthyear birthmonth birthplace dwelling);
  17. print "reading STDIN\n" if ($log_reads);
  18. my (@records, $format);
  19. while (my $line = <STDIN>) {
  20. if ($line =~ m/$pattern/)
  21. {
  22. # found a person
  23. my @values = parse_person($line, $format);
  24. # create a hash reference to store
  25. # the @values and a copy of $line,
  26. # then push it onto the stack of records:
  27. my $n = {};
  28. @$n{@fields} = (@values, $line);
  29. push @records, $n;
  30. if ($log_reads) {
  31. print ("\tread: $n->{fname} $n->{lname} $n->{age}\n");
  32. }
  33. }
  34. elsif ($line =~ m/\([0-9]{1,2}\)/)
  35. {
  36. # we've got a line that indicates field position
  37. # pull out field indices using left parens,
  38. # then convert to a format for unpack
  39. $format = cut2fmt( find_indices($line, '(') );
  40. if ($log_reads) {
  41. print "new format: $format\n";
  42. }
  43. }
  44. elsif ($log_reads)
  45. {
  46. print "\tgarbage line\n";
  47. }
  48. }
  49. # allocate a bunch of keys
  50. my %instances;
  51. keys (%instances) = scalar(@records);
  52. my %lines;
  53. my $record_count = 0;
  54. # count the number of times a given combination of attributes occurs
  55. for my $person (@records) {
  56. next unless ($person->{lname});
  57. # crude stoplist to weed out some obvious false positives
  58. if (
  59. $person->{lname} =~ m/unoccupied|vacant|\.|^\*$/i
  60. or $person->{birthyear} =~ m/\.|^\*$/i
  61. or $person->{relation} =~ m/^head$/i # head:wife relationships
  62. ) {
  63. next;
  64. }
  65. $record_count++;
  66. $person->{birthmonth} = substr($person->{birthmonth}, 0, 3);
  67. if ($person->{birthplace} eq 'ne') {
  68. $person->{birthplace} = 'nebraska';
  69. }
  70. # build a key name:
  71. my $instance;
  72. for (@match_fields) {
  73. $instance .= ":$person->{$_}";
  74. }
  75. $instances{$instance}++;
  76. # accumulate text records
  77. $lines{$instance} .= $person->{line};
  78. }
  79. # print every instance with more than one entry
  80. my $twin_count;
  81. for ( sort(keys %instances) ) {
  82. if ($instances{$_} > 1) {
  83. print "\n$_ = $instances{$_}\n";
  84. print $lines{$_};
  85. $twin_count++;
  86. }
  87. }
  88. print "\n$record_count of " . scalar(@records) . " records compared"
  89. . "\nfound $twin_count potential multiple-birth groups\n";
  90. ###############
  91. # SUBROUTINES #
  92. ###############
  93. # take a line of text and a format to unpack into an array
  94. sub parse_person {
  95. my ($line, $format) = @_;
  96. my (@fields);
  97. @fields = unpack($format, $line);
  98. # do some formatting on fields
  99. for (@fields) {
  100. #s/[,'`;]//g; # kill some punctuation
  101. s/^\s+|\s+$//g; # remove spaces
  102. $_ = lc($_); # lowercase
  103. s/^([a-z]+)\./$1/g; # take care of trailing periods
  104. }
  105. return @fields;
  106. }
  107. # from the perl cookbook
  108. # turn a list of columns into a format for unpack
  109. sub cut2fmt {
  110. my (@positions) = @_;
  111. my $template = '';
  112. my $lastpos = 1;
  113. foreach my $place (@positions) {
  114. $template .= "A" . ($place - $lastpos) . " ";
  115. $lastpos = $place;
  116. }
  117. $template .= "A*";
  118. return $template;
  119. }
  120. # pull out field indices using left parens
  121. sub find_indices {
  122. my ($line, $marker) = @_;
  123. my (@indices);
  124. my $index = 0;
  125. while ($index < length($line)) {
  126. $index++;
  127. if ( substr($line, $index, 1) eq "(" ) {
  128. push (@indices, $index);
  129. }
  130. }
  131. return @indices;
  132. }