Some hacky Perl to find twin pairs in census transcriptions
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 

169 lines
3.7 KiB

#!/usr/bin/perl
use strict;
use warnings;
no warnings 'uninitialized';
# 1900
# Log records as they're read?
my $log_reads = 0;
my $pattern = '^[0-9]';
# List of all fields:
my @fields = qw(null dwelling fm lname fname relation race sex birthmonth
birthyear age marital_status years_married children children_living
birthplace father_birthplace mother_birthplace year_immi years_us
naturalized occupation months_unemployed months_school read write eng
o_r_home f_m_mort farm farm_sch line);
# We'll key on these when looking for twins:
my @match_fields = qw(lname birthyear birthmonth birthplace dwelling);
print "reading STDIN\n" if ($log_reads);
my (@records, $format);
while (my $line = <STDIN>) {
if ($line =~ m/$pattern/)
{
# found a person
my @values = parse_person($line, $format);
# create a hash reference to store
# the @values and a copy of $line,
# then push it onto the stack of records:
my $n = {};
@$n{@fields} = (@values, $line);
push @records, $n;
if ($log_reads) {
print ("\tread: $n->{fname} $n->{lname} $n->{age}\n");
}
}
elsif ($line =~ m/\([0-9]{1,2}\)/)
{
# we've got a line that indicates field position
# pull out field indices using left parens,
# then convert to a format for unpack
$format = cut2fmt( find_indices($line, '(') );
if ($log_reads) {
print "new format: $format\n";
}
}
elsif ($log_reads)
{
print "\tgarbage line\n";
}
}
# allocate a bunch of keys
my %instances;
keys (%instances) = scalar(@records);
my %lines;
my $record_count = 0;
# count the number of times a given combination of attributes occurs
for my $person (@records) {
next unless ($person->{lname});
# crude stoplist to weed out some obvious false positives
if (
$person->{lname} =~ m/unoccupied|vacant|\.|^\*$/i
or $person->{birthyear} =~ m/\.|^\*$/i
or $person->{relation} =~ m/^head$/i # head:wife relationships
) {
next;
}
$record_count++;
$person->{birthmonth} = substr($person->{birthmonth}, 0, 3);
if ($person->{birthplace} eq 'ne') {
$person->{birthplace} = 'nebraska';
}
# build a key name:
my $instance;
for (@match_fields) {
$instance .= ":$person->{$_}";
}
$instances{$instance}++;
# accumulate text records
$lines{$instance} .= $person->{line};
}
# print every instance with more than one entry
my $twin_count;
for ( sort(keys %instances) ) {
if ($instances{$_} > 1) {
print "\n$_ = $instances{$_}\n";
print $lines{$_};
$twin_count++;
}
}
print "\n$record_count of " . scalar(@records) . " records compared"
. "\nfound $twin_count potential multiple-birth groups\n";
###############
# SUBROUTINES #
###############
# take a line of text and a format to unpack into an array
sub parse_person {
my ($line, $format) = @_;
my (@fields);
@fields = unpack($format, $line);
# do some formatting on fields
for (@fields) {
#s/[,'`;]//g; # kill some punctuation
s/^\s+|\s+$//g; # remove spaces
$_ = lc($_); # lowercase
s/^([a-z]+)\./$1/g; # take care of trailing periods
}
return @fields;
}
# from the perl cookbook
# turn a list of columns into a format for unpack
sub cut2fmt {
my (@positions) = @_;
my $template = '';
my $lastpos = 1;
foreach my $place (@positions) {
$template .= "A" . ($place - $lastpos) . " ";
$lastpos = $place;
}
$template .= "A*";
return $template;
}
# pull out field indices using left parens
sub find_indices {
my ($line, $marker) = @_;
my (@indices);
my $index = 0;
while ($index < length($line)) {
$index++;
if ( substr($line, $index, 1) eq "(" ) {
push (@indices, $index);
}
}
return @indices;
}