#!/usr/bin/perl use strict; use warnings; no warnings 'uninitialized'; # 1900 # Log records as they're read? my $log_reads = 0; my $pattern = '^[0-9]'; # List of all fields: my @fields = qw(null dwelling fm lname fname relation race sex birthmonth birthyear age marital_status years_married children children_living birthplace father_birthplace mother_birthplace year_immi years_us naturalized occupation months_unemployed months_school read write eng o_r_home f_m_mort farm farm_sch line); # We'll key on these when looking for twins: my @match_fields = qw(lname birthyear birthmonth birthplace dwelling); print "reading STDIN\n" if ($log_reads); my (@records, $format); while (my $line = ) { if ($line =~ m/$pattern/) { # found a person my @values = parse_person($line, $format); # create a hash reference to store # the @values and a copy of $line, # then push it onto the stack of records: my $n = {}; @$n{@fields} = (@values, $line); push @records, $n; if ($log_reads) { print ("\tread: $n->{fname} $n->{lname} $n->{age}\n"); } } elsif ($line =~ m/\([0-9]{1,2}\)/) { # we've got a line that indicates field position # pull out field indices using left parens, # then convert to a format for unpack $format = cut2fmt( find_indices($line, '(') ); if ($log_reads) { print "new format: $format\n"; } } elsif ($log_reads) { print "\tgarbage line\n"; } } # allocate a bunch of keys my %instances; keys (%instances) = scalar(@records); my %lines; my $record_count = 0; # count the number of times a given combination of attributes occurs for my $person (@records) { next unless ($person->{lname}); # crude stoplist to weed out some obvious false positives if ( $person->{lname} =~ m/unoccupied|vacant|\.|^\*$/i or $person->{birthyear} =~ m/\.|^\*$/i or $person->{relation} =~ m/^head$/i # head:wife relationships ) { next; } $record_count++; $person->{birthmonth} = substr($person->{birthmonth}, 0, 3); if ($person->{birthplace} eq 'ne') { $person->{birthplace} = 'nebraska'; } # build a key name: my $instance; for (@match_fields) { $instance .= ":$person->{$_}"; } $instances{$instance}++; # accumulate text records $lines{$instance} .= $person->{line}; } # print every instance with more than one entry my $twin_count; for ( sort(keys %instances) ) { if ($instances{$_} > 1) { print "\n$_ = $instances{$_}\n"; print $lines{$_}; $twin_count++; } } print "\n$record_count of " . scalar(@records) . " records compared" . "\nfound $twin_count potential multiple-birth groups\n"; ############### # SUBROUTINES # ############### # take a line of text and a format to unpack into an array sub parse_person { my ($line, $format) = @_; my (@fields); @fields = unpack($format, $line); # do some formatting on fields for (@fields) { #s/[,'`;]//g; # kill some punctuation s/^\s+|\s+$//g; # remove spaces $_ = lc($_); # lowercase s/^([a-z]+)\./$1/g; # take care of trailing periods } return @fields; } # from the perl cookbook # turn a list of columns into a format for unpack sub cut2fmt { my (@positions) = @_; my $template = ''; my $lastpos = 1; foreach my $place (@positions) { $template .= "A" . ($place - $lastpos) . " "; $lastpos = $place; } $template .= "A*"; return $template; } # pull out field indices using left parens sub find_indices { my ($line, $marker) = @_; my (@indices); my $index = 0; while ($index < length($line)) { $index++; if ( substr($line, $index, 1) eq "(" ) { push (@indices, $index); } } return @indices; }