|
|
- #!/usr/bin/env perl
-
- # For a sort of rebuttal of this program, see:
- # http://www.leancrew.com/all-this/2011/12/more-shell-less-egg/
-
- use strict;
- use warnings;
- use 5.10.0;
- use Getopt::Std;
-
- our ($opt_h, $opt_i, $opt_u, $opt_a, $opt_c, $opt_b, $opt_s, $opt_d) = (0, 0, 0, 0, 0, 0, 0, '\s+');
- getopts('huacib:s:d:');
-
- # documentate
- if ($opt_h) {
- print <<'HELP';
- Usage: words [-ucaih] [-s n] [-b n] [-d pattern] [file]
- Split input into individual words, crudely understood.
-
- -u: print each unique word only once
- -c: print a count of words and exit
- -uc: print a count for each unique word
- -a: strip non-alphanumeric characters for current locale
- -i: coerce all to lowercase, ignore case when considering duplicates
- -h: print this help and exit
-
- -s n, -b n: (s)hortest and (b)iggest words to pass through
- -d pattern: word delimiter (a Perl regexp)
-
- If no file is given, standard input will be read instead.
-
- Examples:
-
- # list all unique words, ignoring case, in foo:
- words -ui ./foo
-
- # find ten most used words longer than 6 letters in foo:
- words -uci -s6 foo | sort -nr | head -10
-
- HELP
- exit;
- }
-
- if ($opt_b && ($opt_b < $opt_s)) {
- say "illogical length constraint:\n"
- . " [-b]iggest of $opt_b can't be less than [-s]hortest of $opt_s";
- exit 1;
- }
-
- my $everything;
- {
- local $/ = undef; # unset record separator
- while (<>) {
- # one file per iteration, if we're processing filenames
- $everything .= $_;
- }
- }
-
- if (! defined $everything) {
- exit 2;
- }
-
- if ($opt_a) {
- # strip non-alphanumeric characters for current locale:
- $everything =~ s/[^[:alnum:]]/ /gx;
- }
-
- # chop into individual "words" by our delimiter
- # (defaults to any amount of whitespace):
- my %words;
- my $wordcount = 0;
- foreach my $word (split /$opt_d/, $everything) {
- if ($opt_i) {
- $word = lc($word);
- }
-
- # don't allow bigger than -b or shorter than -s letters
- if ($opt_b) { next if length($word) > $opt_b; }
- if ($opt_s) { next if length($word) < $opt_s; }
-
- # skip non-unique words
- if ($opt_u) { next if ( $words{$word}++ ) > 0; }
-
- say $word unless $opt_c;
-
- $wordcount++;
- }
-
- # we've been asked to give a count for each unique word
- if ($opt_u && $opt_c) {
- foreach my $word (keys %words) {
- say "$words{$word}\t$word";
- }
- } elsif ($opt_c) {
- # wc(1) stylee:
- say " $wordcount";
- }
|