brennen
/
twinpairs


								#!/usr/bin/perl


								use strict;

								use warnings;

								no  warnings 'uninitialized';


								# 1900


								# Log records as they're read?

								my $log_reads = 0;


								my $pattern = '^[0-9]';


								# List of all fields:

								my @fields = qw(null dwelling fm lname fname relation race sex birthmonth

								    birthyear age marital_status years_married children children_living

								    birthplace father_birthplace mother_birthplace year_immi years_us

								    naturalized occupation months_unemployed months_school read write eng

								    o_r_home f_m_mort farm farm_sch line);


								# We'll key on these when looking for twins:

								my @match_fields = qw(lname birthyear birthmonth birthplace dwelling);


								print "reading STDIN\n" if ($log_reads);


								my (@records, $format);

								while (my $line = <STDIN>) {


								  if ($line =~ m/$pattern/)

								  {

								    # found a person

								    my @values = parse_person($line, $format);


								    # create a hash reference to store

								    # the @values and a copy of $line,

								    # then push it onto the stack of records:

								    my $n = {};

								    @$n{@fields} = (@values, $line);

								    push @records, $n;


								    if ($log_reads) {

								      print ("\tread: $n->{fname} $n->{lname} $n->{age}\n");

								    }


								  }

								  elsif ($line =~ m/\([0-9]{1,2}\)/)

								  {

								    # we've got a line that indicates field position

								    # pull out field indices using left parens,

								    # then convert to a format for unpack

								    $format = cut2fmt( find_indices($line, '(') );


								    if ($log_reads) {

								      print "new format: $format\n";

								    }

								  }

								  elsif ($log_reads)

								  {

								    print "\tgarbage line\n";

								  }


								}


								# allocate a bunch of keys

								my %instances;

								keys (%instances) = scalar(@records);


								my %lines;

								my $record_count = 0;


								# count the number of times a given combination of attributes occurs

								for my $person (@records) {

								  next unless ($person->{lname});


								  # crude stoplist to weed out some obvious false positives

								  if (

								    $person->{lname} =~ m/unoccupied|vacant|\.|^\*$/i

								    or $person->{birthyear} =~ m/\.|^\*$/i

								    or $person->{relation} =~ m/^head$/i # head:wife relationships

								  ) {

								    next;

								  }


								  $record_count++;


								  $person->{birthmonth} = substr($person->{birthmonth}, 0, 3);


								  if ($person->{birthplace} eq 'ne') {

								    $person->{birthplace} = 'nebraska';

								  }


								  # build a key name:

								  my $instance;

								  for (@match_fields) {

								    $instance .= ":$person->{$_}";

								  }


								  $instances{$instance}++;


								  # accumulate text records

								  $lines{$instance} .= $person->{line};

								}


								# print every instance with more than one entry

								my $twin_count;

								for ( sort(keys %instances) ) {

								  if ($instances{$_} > 1) {

								    print "\n$_ = $instances{$_}\n";

								    print $lines{$_};

								    $twin_count++;

								  }

								}


								print "\n$record_count of " . scalar(@records) . " records compared"

								    . "\nfound $twin_count potential multiple-birth groups\n";


								###############

								# SUBROUTINES #

								###############


								# take a line of text and a format to unpack into an array

								sub parse_person {

								  my ($line, $format) = @_;

								  my (@fields);


								  @fields = unpack($format, $line);


								  # do some formatting on fields


								  for (@fields) {

								    #s/[,'`;]//g;       # kill some punctuation

								    s/^\s+|\s+$//g;     # remove spaces

								    $_ = lc($_);        # lowercase

								    s/^([a-z]+)\./$1/g; # take care of trailing periods

								  }


								  return @fields;

								}


								# from the perl cookbook

								# turn a list of columns into a format for unpack

								sub cut2fmt {

								  my (@positions) = @_;

								  my $template = '';

								  my $lastpos = 1;

								  foreach my $place (@positions) {

								    $template .= "A" . ($place - $lastpos) . " ";

								    $lastpos = $place;

								  }

								  $template .= "A*";

								  return $template;

								}


								# pull out field indices using left parens

								sub find_indices {

								  my ($line, $marker) = @_;


								  my (@indices);

								  my $index = 0;


								  while ($index < length($line)) {

								    $index++;

								    if ( substr($line, $index, 1) eq "(" ) {

								      push (@indices, $index);

								    }

								  }


								  return @indices;

								}