#!/usr/bin/env perl
							 | 
						|
								
							 | 
						|
								# For a sort of rebuttal of this program, see:
							 | 
						|
								# http://www.leancrew.com/all-this/2011/12/more-shell-less-egg/
							 | 
						|
								
							 | 
						|
								use strict;
							 | 
						|
								use warnings;
							 | 
						|
								use 5.10.0;
							 | 
						|
								use Getopt::Std;
							 | 
						|
								
							 | 
						|
								our ($opt_h, $opt_i, $opt_u, $opt_a, $opt_c, $opt_b, $opt_s, $opt_d) = (0, 0, 0, 0, 0, 0, 0, '\s+');
							 | 
						|
								getopts('huacib:s:d:');
							 | 
						|
								
							 | 
						|
								# documentate
							 | 
						|
								if ($opt_h) {
							 | 
						|
								  print <<'HELP';
							 | 
						|
								Usage: words [-ucaih] [-s n] [-b n] [-d pattern] [file]
							 | 
						|
								Split input into individual words, crudely understood.
							 | 
						|
								
							 | 
						|
								    -u:  print each unique word only once
							 | 
						|
								    -c:  print a count of words and exit
							 | 
						|
								    -uc: print a count for each unique word
							 | 
						|
								    -a:  strip non-alphanumeric characters for current locale
							 | 
						|
								    -i:  coerce all to lowercase, ignore case when considering duplicates
							 | 
						|
								    -h:  print this help and exit
							 | 
						|
								
							 | 
						|
								    -s n, -b n: (s)hortest and (b)iggest words to pass through
							 | 
						|
								    -d pattern: word delimiter (a Perl regexp)
							 | 
						|
								
							 | 
						|
								If no file is given, standard input will be read instead.
							 | 
						|
								
							 | 
						|
								Examples:
							 | 
						|
								
							 | 
						|
								   # list all unique words, ignoring case, in foo:
							 | 
						|
								   words -ui ./foo
							 | 
						|
								
							 | 
						|
								   # find ten most used words longer than 6 letters in foo:
							 | 
						|
								   words -uci -s6 foo | sort -nr | head -10
							 | 
						|
								
							 | 
						|
								HELP
							 | 
						|
								exit;
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								if ($opt_b && ($opt_b < $opt_s)) {
							 | 
						|
								  say "illogical length constraint:\n"
							 | 
						|
								    . "  [-b]iggest of $opt_b can't be less than [-s]hortest of $opt_s";
							 | 
						|
								  exit 1;
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								my $everything;
							 | 
						|
								{
							 | 
						|
								  local $/ = undef; # unset record separator
							 | 
						|
								  while (<>) {
							 | 
						|
								    # one file per iteration, if we're processing filenames
							 | 
						|
								    $everything .= $_;
							 | 
						|
								  }
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								if (! defined $everything) {
							 | 
						|
								  exit 2;
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								if ($opt_a) {
							 | 
						|
								  # strip non-alphanumeric characters for current locale:
							 | 
						|
								  $everything =~ s/[^[:alnum:]]/ /gx;
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								# chop into individual "words" by our delimiter
							 | 
						|
								# (defaults to any amount of whitespace):
							 | 
						|
								my %words;
							 | 
						|
								my $wordcount = 0;
							 | 
						|
								foreach my $word (split /$opt_d/, $everything) {
							 | 
						|
								  if ($opt_i) {
							 | 
						|
								    $word = lc($word);
							 | 
						|
								  }
							 | 
						|
								
							 | 
						|
								  # don't allow bigger than -b or shorter than -s letters
							 | 
						|
								  if ($opt_b) { next if length($word) > $opt_b; }
							 | 
						|
								  if ($opt_s) { next if length($word) < $opt_s; }
							 | 
						|
								
							 | 
						|
								  # skip non-unique words
							 | 
						|
								  if ($opt_u) { next if ( $words{$word}++ ) > 0; }
							 | 
						|
								
							 | 
						|
								  say $word unless $opt_c;
							 | 
						|
								
							 | 
						|
								  $wordcount++;
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								# we've been asked to give a count for each unique word
							 | 
						|
								if ($opt_u && $opt_c) {
							 | 
						|
								  foreach my $word (keys %words) {
							 | 
						|
								    say "$words{$word}\t$word";
							 | 
						|
								  }
							 | 
						|
								} elsif ($opt_c) {
							 | 
						|
								  # wc(1) stylee:
							 | 
						|
								  say " $wordcount";
							 | 
						|
								}
							 |