|
|
- #!/usr/bin/perl
-
- use strict;
- use warnings;
- no warnings 'uninitialized';
-
- # 1900
-
- # Log records as they're read?
- my $log_reads = 0;
-
- my $pattern = '^[0-9]';
-
- # List of all fields:
- my @fields = qw(null dwelling fm lname fname relation race sex birthmonth
- birthyear age marital_status years_married children children_living
- birthplace father_birthplace mother_birthplace year_immi years_us
- naturalized occupation months_unemployed months_school read write eng
- o_r_home f_m_mort farm farm_sch line);
-
- # We'll key on these when looking for twins:
- my @match_fields = qw(lname birthyear birthmonth birthplace dwelling);
-
- print "reading STDIN\n" if ($log_reads);
-
- my (@records, $format);
- while (my $line = <STDIN>) {
-
- if ($line =~ m/$pattern/)
- {
- # found a person
- my @values = parse_person($line, $format);
-
- # create a hash reference to store
- # the @values and a copy of $line,
- # then push it onto the stack of records:
- my $n = {};
- @$n{@fields} = (@values, $line);
- push @records, $n;
-
- if ($log_reads) {
- print ("\tread: $n->{fname} $n->{lname} $n->{age}\n");
- }
-
- }
- elsif ($line =~ m/\([0-9]{1,2}\)/)
- {
- # we've got a line that indicates field position
- # pull out field indices using left parens,
- # then convert to a format for unpack
- $format = cut2fmt( find_indices($line, '(') );
-
- if ($log_reads) {
- print "new format: $format\n";
- }
- }
- elsif ($log_reads)
- {
- print "\tgarbage line\n";
- }
-
- }
-
- # allocate a bunch of keys
- my %instances;
- keys (%instances) = scalar(@records);
-
- my %lines;
- my $record_count = 0;
-
- # count the number of times a given combination of attributes occurs
- for my $person (@records) {
- next unless ($person->{lname});
-
- # crude stoplist to weed out some obvious false positives
- if (
- $person->{lname} =~ m/unoccupied|vacant|\.|^\*$/i
- or $person->{birthyear} =~ m/\.|^\*$/i
- or $person->{relation} =~ m/^head$/i # head:wife relationships
- ) {
- next;
- }
-
- $record_count++;
-
- $person->{birthmonth} = substr($person->{birthmonth}, 0, 3);
-
- if ($person->{birthplace} eq 'ne') {
- $person->{birthplace} = 'nebraska';
- }
-
- # build a key name:
- my $instance;
- for (@match_fields) {
- $instance .= ":$person->{$_}";
- }
-
- $instances{$instance}++;
-
- # accumulate text records
- $lines{$instance} .= $person->{line};
- }
-
- # print every instance with more than one entry
- my $twin_count;
- for ( sort(keys %instances) ) {
- if ($instances{$_} > 1) {
- print "\n$_ = $instances{$_}\n";
- print $lines{$_};
- $twin_count++;
- }
- }
-
- print "\n$record_count of " . scalar(@records) . " records compared"
- . "\nfound $twin_count potential multiple-birth groups\n";
-
- ###############
- # SUBROUTINES #
- ###############
-
- # take a line of text and a format to unpack into an array
- sub parse_person {
- my ($line, $format) = @_;
- my (@fields);
-
- @fields = unpack($format, $line);
-
- # do some formatting on fields
-
- for (@fields) {
- #s/[,'`;]//g; # kill some punctuation
- s/^\s+|\s+$//g; # remove spaces
- $_ = lc($_); # lowercase
- s/^([a-z]+)\./$1/g; # take care of trailing periods
- }
-
- return @fields;
- }
-
- # from the perl cookbook
- # turn a list of columns into a format for unpack
- sub cut2fmt {
- my (@positions) = @_;
- my $template = '';
- my $lastpos = 1;
- foreach my $place (@positions) {
- $template .= "A" . ($place - $lastpos) . " ";
- $lastpos = $place;
- }
- $template .= "A*";
- return $template;
- }
-
- # pull out field indices using left parens
- sub find_indices {
- my ($line, $marker) = @_;
-
- my (@indices);
- my $index = 0;
-
- while ($index < length($line)) {
- $index++;
- if ( substr($line, $index, 1) eq "(" ) {
- push (@indices, $index);
- }
- }
-
- return @indices;
- }
|