brennen
/
twinpairs

#!/usr/bin/perl
use strict;use warnings;no  warnings 'uninitialized';
# 1900
# Set to a true value to log each record (and each format change) as the
# input is read.
my $log_reads = 0;

# Input lines matching this pattern (a leading digit) are treated as
# person records.
my $pattern = '^[0-9]';

# Names for every slot of a person record, in the order the values are
# stored.  The final name, 'line', is used for the raw text of the record.
my @fields = qw(
    null
    dwelling
    fm
    lname
    fname
    relation
    race
    sex
    birthmonth
    birthyear
    age
    marital_status
    years_married
    children
    children_living
    birthplace
    father_birthplace
    mother_birthplace
    year_immi
    years_us
    naturalized
    occupation
    months_unemployed
    months_school
    read
    write
    eng
    o_r_home
    f_m_mort
    farm
    farm_sch
    line
);

# Fields whose combined values form the key used when looking for twins.
my @match_fields = qw(
    lname
    birthyear
    birthmonth
    birthplace
    dwelling
);

if ($log_reads) {
    print "reading STDIN\n";
}
# Read the census text from STDIN and collect one hash reference per
# person in @records.
#
# Three kinds of line are distinguished:
#   * a line matching $pattern is a person record; it is unpacked with the
#     most recently seen format and stored;
#   * a line containing a parenthesised one- or two-digit number, such as
#     "(12)", indicates field positions and replaces the current format;
#   * anything else is ignored (and reported when $log_reads is set).
my (@records, $format);

# Every name in @fields except 'line' receives an unpacked value; 'line'
# always holds the raw text.  Assigning it explicitly keeps the raw text
# in the right slot even when unpack yields more or fewer values than
# there are data fields (for example, before any format line was seen).
my @data_fields = grep { $_ ne 'line' } @fields;

while (my $line = <STDIN>) {
    if ($line =~ m/$pattern/) {
        # found a person
        my @values = parse_person($line, $format);

        # Store the values by field name, plus a copy of the raw line,
        # then push the record onto the stack of records.  Surplus values
        # are dropped; missing ones leave their fields undefined.
        my %person;
        @person{@data_fields} = @values;
        $person{line} = $line;
        push @records, \%person;

        if ($log_reads) {
            print("\tread: $person{fname} $person{lname} $person{age}\n");
        }
    }
    elsif ($line =~ m/\([0-9]{1,2}\)/) {
        # We've got a line that indicates field position: pull out the
        # field indices using left parens, then convert them to a format
        # for unpack.
        $format = cut2fmt( find_indices($line, '(') );

        if ($log_reads) {
            print "new format: $format\n";
        }
    }
    elsif ($log_reads) {
        print "\tgarbage line\n";
    }
}
# Group the records by the fields named in @match_fields and report every
# group that contains more than one person.

# Pre-size the hash: there can be at most one key per record.
my %instances;
keys(%instances) = scalar(@records);

my %lines;             # key => concatenated raw text of the matching records
my $record_count = 0;  # records that survived the stoplist

# Count the number of times a given combination of attributes occurs.
for my $person (@records) {
    next unless ($person->{lname});

    # Crude stoplist to weed out some obvious false positives.
    next if $person->{lname}     =~ m/unoccupied|vacant|\.|^\*$/i;
    next if $person->{birthyear} =~ m/\.|^\*$/i;
    next if $person->{relation}  =~ m/^head$/i;    # head:wife relationships

    $record_count++;

    # Normalise the values that take part in the key: only the first three
    # characters of the birth month are compared, and 'ne' is expanded to
    # 'nebraska'.
    $person->{birthmonth} = substr($person->{birthmonth}, 0, 3);
    if ($person->{birthplace} eq 'ne') {
        $person->{birthplace} = 'nebraska';
    }

    # Build a key name; each component is prefixed with ':'.
    my $instance = join '', map { ":$person->{$_}" } @match_fields;

    $instances{$instance}++;

    # Accumulate text records.
    $lines{$instance} .= $person->{line};
}

# Print every instance with more than one entry.  The counter starts at 0
# so the summary shows "0" rather than an empty string when nothing is found.
my $twin_count = 0;
for my $key (sort keys %instances) {
    next unless $instances{$key} > 1;

    print "\n$key = $instances{$key}\n";
    print $lines{$key};
    $twin_count++;
}

print "\n$record_count of " . scalar(@records) . " records compared"
    . "\nfound $twin_count potential multiple-birth groups\n";
################ SUBROUTINES ################
# Split one line of text into cleaned-up field values.
#
# Parameters:
#   $line   - the raw text of a record
#   $format - an unpack template describing the fixed-width columns
#
# Returns the list of unpacked values, each one trimmed of leading and
# trailing whitespace, lowercased, and with the period removed from a
# leading run of letters that is directly followed by one ("wm." => "wm").
sub parse_person {
    my ($line, $format) = @_;

    return map {
        my $value = $_;

        # A further clean-up step, stripping the punctuation [,'`;], was
        # left disabled in the original and stays disabled here.
        $value =~ s/^\s+|\s+$//g;        # remove surrounding spaces
        $value = lc($value);             # lowercase
        $value =~ s/^([a-z]+)\./$1/g;    # drop the period after a leading word
        $value;
    } unpack($format, $line);
}
# From the Perl Cookbook: turn a list of column positions into a template
# for unpack.
#
# Parameters:
#   @positions - column positions at which a new field starts; the first
#                field is taken to start at column 1
#
# Returns a template string made of one "A<width>" item per position,
# where each width is the distance from the previous position, followed by
# "A*" to take the rest of the line.  With no positions the result is "A*".
sub cut2fmt {
    my (@positions) = @_;

    my $previous = 1;
    my @pieces;
    for my $column (@positions) {
        push @pieces, 'A' . ($column - $previous);
        $previous = $column;
    }

    return join(' ', @pieces, 'A*');
}
# Find where a marker string occurs in a line of text.
#
# Parameters:
#   $line   - the text to scan
#   $marker - the string to look for; '(' is used when it is omitted,
#             undefined or empty
#
# Returns the list of 0-based offsets at which $marker starts, in ascending
# order.  Offset 0 is never reported: the scan starts at offset 1.  That
# matters to cut2fmt, which takes its first column to be 1 and would
# compute a negative width for a position of 0.
sub find_indices {
    my ($line, $marker) = @_;

    $line   = ''  unless defined $line;
    $marker = '(' unless defined($marker) && length($marker);

    my @indices;
    my $offset = index($line, $marker, 1);
    while ($offset >= 0) {
        push @indices, $offset;
        # Resume one character further on, so overlapping occurrences of a
        # multi-character marker are reported too.
        $offset = index($line, $marker, $offset + 1);
    }

    return @indices;
}