#!/usr/bin/perl
|
|
|
|
use strict;
|
|
use warnings;
|
|
no warnings 'uninitialized';
|
|
|
|
# 1900
|
|
|
|
# Log records as they're read?
|
|
my $log_reads = 0;
|
|
|
|
my $pattern = '^[0-9]';
|
|
|
|
# List of all fields:
|
|
my @fields = qw(null dwelling fm lname fname relation race sex birthmonth
|
|
birthyear age marital_status years_married children children_living
|
|
birthplace father_birthplace mother_birthplace year_immi years_us
|
|
naturalized occupation months_unemployed months_school read write eng
|
|
o_r_home f_m_mort farm farm_sch line);
|
|
|
|
# We'll key on these when looking for twins:
|
|
my @match_fields = qw(lname birthyear birthmonth birthplace dwelling);
|
|
|
|
print "reading STDIN\n" if ($log_reads);
|
|
|
|
my (@records, $format);
|
|
while (my $line = <STDIN>) {
|
|
|
|
if ($line =~ m/$pattern/)
|
|
{
|
|
# found a person
|
|
my @values = parse_person($line, $format);
|
|
|
|
# create a hash reference to store
|
|
# the @values and a copy of $line,
|
|
# then push it onto the stack of records:
|
|
my $n = {};
|
|
@$n{@fields} = (@values, $line);
|
|
push @records, $n;
|
|
|
|
if ($log_reads) {
|
|
print ("\tread: $n->{fname} $n->{lname} $n->{age}\n");
|
|
}
|
|
|
|
}
|
|
elsif ($line =~ m/\([0-9]{1,2}\)/)
|
|
{
|
|
# we've got a line that indicates field position
|
|
# pull out field indices using left parens,
|
|
# then convert to a format for unpack
|
|
$format = cut2fmt( find_indices($line, '(') );
|
|
|
|
if ($log_reads) {
|
|
print "new format: $format\n";
|
|
}
|
|
}
|
|
elsif ($log_reads)
|
|
{
|
|
print "\tgarbage line\n";
|
|
}
|
|
|
|
}
|
|
|
|
# allocate a bunch of keys
|
|
my %instances;
|
|
keys (%instances) = scalar(@records);
|
|
|
|
my %lines;
|
|
my $record_count = 0;
|
|
|
|
# count the number of times a given combination of attributes occurs
|
|
for my $person (@records) {
|
|
next unless ($person->{lname});
|
|
|
|
# crude stoplist to weed out some obvious false positives
|
|
if (
|
|
$person->{lname} =~ m/unoccupied|vacant|\.|^\*$/i
|
|
or $person->{birthyear} =~ m/\.|^\*$/i
|
|
or $person->{relation} =~ m/^head$/i # head:wife relationships
|
|
) {
|
|
next;
|
|
}
|
|
|
|
$record_count++;
|
|
|
|
$person->{birthmonth} = substr($person->{birthmonth}, 0, 3);
|
|
|
|
if ($person->{birthplace} eq 'ne') {
|
|
$person->{birthplace} = 'nebraska';
|
|
}
|
|
|
|
# build a key name:
|
|
my $instance;
|
|
for (@match_fields) {
|
|
$instance .= ":$person->{$_}";
|
|
}
|
|
|
|
$instances{$instance}++;
|
|
|
|
# accumulate text records
|
|
$lines{$instance} .= $person->{line};
|
|
}
|
|
|
|
# print every instance with more than one entry
|
|
my $twin_count;
|
|
for ( sort(keys %instances) ) {
|
|
if ($instances{$_} > 1) {
|
|
print "\n$_ = $instances{$_}\n";
|
|
print $lines{$_};
|
|
$twin_count++;
|
|
}
|
|
}
|
|
|
|
print "\n$record_count of " . scalar(@records) . " records compared"
|
|
. "\nfound $twin_count potential multiple-birth groups\n";
|
|
|
|
###############
|
|
# SUBROUTINES #
|
|
###############
|
|
|
|
# take a line of text and a format to unpack into an array
|
|
sub parse_person {
|
|
my ($line, $format) = @_;
|
|
my (@fields);
|
|
|
|
@fields = unpack($format, $line);
|
|
|
|
# do some formatting on fields
|
|
|
|
for (@fields) {
|
|
#s/[,'`;]//g; # kill some punctuation
|
|
s/^\s+|\s+$//g; # remove spaces
|
|
$_ = lc($_); # lowercase
|
|
s/^([a-z]+)\./$1/g; # take care of trailing periods
|
|
}
|
|
|
|
return @fields;
|
|
}
|
|
|
|
# from the perl cookbook
|
|
# turn a list of columns into a format for unpack
|
|
sub cut2fmt {
|
|
my (@positions) = @_;
|
|
my $template = '';
|
|
my $lastpos = 1;
|
|
foreach my $place (@positions) {
|
|
$template .= "A" . ($place - $lastpos) . " ";
|
|
$lastpos = $place;
|
|
}
|
|
$template .= "A*";
|
|
return $template;
|
|
}
|
|
|
|
# pull out field indices using left parens
|
|
sub find_indices {
|
|
my ($line, $marker) = @_;
|
|
|
|
my (@indices);
|
|
my $index = 0;
|
|
|
|
while ($index < length($line)) {
|
|
$index++;
|
|
if ( substr($line, $index, 1) eq "(" ) {
|
|
push (@indices, $index);
|
|
}
|
|
}
|
|
|
|
return @indices;
|
|
}
|