#!/usr/bin/perl

use strict;
use warnings;
no  warnings 'uninitialized';

# 1900

# Log records as they're read?
my $log_reads = 0;

my $pattern = '^[0-9]';

# List of all fields:
my @fields = qw(null dwelling fm lname fname relation race sex birthmonth
    birthyear age marital_status years_married children children_living
    birthplace father_birthplace mother_birthplace year_immi years_us
    naturalized occupation months_unemployed months_school read write eng
    o_r_home f_m_mort farm farm_sch line);

# We'll key on these when looking for twins:
my @match_fields = qw(lname birthyear birthmonth birthplace dwelling);

print "reading STDIN\n" if ($log_reads);

my (@records, $format);
while (my $line = <STDIN>) {
  
  if ($line =~ m/$pattern/)
  {
    # found a person
    my @values = parse_person($line, $format);

    # create a hash reference to store
    # the @values and a copy of $line,
    # then push it onto the stack of records:
    my $n = {};
    @$n{@fields} = (@values, $line);
    push @records, $n;

    if ($log_reads) {
      print ("\tread: $n->{fname} $n->{lname} $n->{age}\n");
    }

  }
  elsif ($line =~ m/\([0-9]{1,2}\)/)
  {
    # we've got a line that indicates field position
    # pull out field indices using left parens, 
    # then convert to a format for unpack
    $format = cut2fmt( find_indices($line, '(') );

    if ($log_reads) {
      print "new format: $format\n";
    }
  }
  elsif ($log_reads)
  {
    print "\tgarbage line\n";
  }

}

# allocate a bunch of keys
my %instances;
keys (%instances) = scalar(@records);

my %lines;
my $record_count = 0;

# count the number of times a given combination of attributes occurs
for my $person (@records) {
  next unless ($person->{lname});

  # crude stoplist to weed out some obvious false positives
  if (
    $person->{lname} =~ m/unoccupied|vacant|\.|^\*$/i
    or $person->{birthyear} =~ m/\.|^\*$/i
    or $person->{relation} =~ m/^head$/i # head:wife relationships
  ) {
    next;
  }

  $record_count++;

  $person->{birthmonth} = substr($person->{birthmonth}, 0, 3);

  if ($person->{birthplace} eq 'ne') {
    $person->{birthplace} = 'nebraska';
  }

  # build a key name:
  my $instance;
  for (@match_fields) {
    $instance .= ":$person->{$_}";
  }

  $instances{$instance}++;

  # accumulate text records
  $lines{$instance} .= $person->{line};
}

# print every instance with more than one entry
my $twin_count;
for ( sort(keys %instances) ) {
  if ($instances{$_} > 1) {
    print "\n$_ = $instances{$_}\n";
    print $lines{$_};
    $twin_count++;
  }
}

print "\n$record_count of " . scalar(@records) . " records compared"
    . "\nfound $twin_count potential multiple-birth groups\n";

###############
# SUBROUTINES #
###############

# take a line of text and a format to unpack into an array
sub parse_person {
  my ($line, $format) = @_;
  my (@fields);  

  @fields = unpack($format, $line);

  # do some formatting on fields

  for (@fields) {
    #s/[,'`;]//g;       # kill some punctuation
    s/^\s+|\s+$//g;     # remove spaces
    $_ = lc($_);        # lowercase
    s/^([a-z]+)\./$1/g; # take care of trailing periods
  }

  return @fields;
}

# from the perl cookbook
# turn a list of columns into a format for unpack
sub cut2fmt {
  my (@positions) = @_;
  my $template = '';
  my $lastpos = 1;
  foreach my $place (@positions) {
    $template .= "A" . ($place - $lastpos) . " ";
    $lastpos = $place;
  }
  $template .= "A*";
  return $template;
}

# pull out field indices using left parens
sub find_indices {
  my ($line, $marker) = @_;

  my (@indices);
  my $index = 0;

  while ($index < length($line)) {
    $index++;
    if ( substr($line, $index, 1) eq "(" ) {
      push (@indices, $index);
    }
  }

  return @indices;
}