#!/usr/bin/env perl # For a sort of rebuttal of this program, see: # http://www.leancrew.com/all-this/2011/12/more-shell-less-egg/ use strict; use warnings; use 5.10.0; use Getopt::Std; our ($opt_h, $opt_i, $opt_u, $opt_a, $opt_c, $opt_b, $opt_s, $opt_d) = (0, 0, 0, 0, 0, 0, 0, '\s+'); getopts('huacib:s:d:'); # documentate if ($opt_h) { print <<'HELP'; Usage: words [-ucaih] [-s n] [-b n] [-d pattern] [file] Split input into individual words, crudely understood. -u: print each unique word only once -c: print a count of words and exit -uc: print a count for each unique word -a: strip non-alphanumeric characters for current locale -i: coerce all to lowercase, ignore case when considering duplicates -h: print this help and exit -s n, -b n: (s)hortest and (b)iggest words to pass through -d pattern: word delimiter (a Perl regexp) If no file is given, standard input will be read instead. Examples: # list all unique words, ignoring case, in foo: words -ui ./foo # find ten most used words longer than 6 letters in foo: words -uci -s6 foo | sort -nr | head -10 HELP exit; } if ($opt_b && ($opt_b < $opt_s)) { say "illogical length constraint:\n" . " [-b]iggest of $opt_b can't be less than [-s]hortest of $opt_s"; exit 1; } my $everything; { local $/ = undef; # unset record separator while (<>) { # one file per iteration, if we're processing filenames $everything .= $_; } } if (! defined $everything) { exit 2; } if ($opt_a) { # strip non-alphanumeric characters for current locale: $everything =~ s/[^[:alnum:]]/ /gx; } # chop into individual "words" by our delimiter # (defaults to any amount of whitespace): my %words; my $wordcount = 0; foreach my $word (split /$opt_d/, $everything) { if ($opt_i) { $word = lc($word); } # don't allow bigger than -b or shorter than -s letters if ($opt_b) { next if length($word) > $opt_b; } if ($opt_s) { next if length($word) < $opt_s; } # skip non-unique words if ($opt_u) { next if ( $words{$word}++ ) > 0; } say $word unless $opt_c; $wordcount++; } # we've been asked to give a count for each unique word if ($opt_u && $opt_c) { foreach my $word (keys %words) { say "$words{$word}\t$word"; } } elsif ($opt_c) { # wc(1) stylee: say " $wordcount"; }