#!/usr/bin/env perl
|
|
|
|
# For a sort of rebuttal of this program, see:
|
|
# http://www.leancrew.com/all-this/2011/12/more-shell-less-egg/
|
|
|
|
use strict;
|
|
use warnings;
|
|
use 5.10.0;
|
|
use Getopt::Std;
|
|
|
|
our ($opt_h, $opt_i, $opt_u, $opt_a, $opt_c, $opt_b, $opt_s, $opt_d) = (0, 0, 0, 0, 0, 0, 0, '\s+');
|
|
getopts('huacib:s:d:');
|
|
|
|
# documentate
|
|
if ($opt_h) {
|
|
print <<'HELP';
|
|
Usage: words [-ucaih] [-s n] [-b n] [-d pattern] [file]
|
|
Split input into individual words, crudely understood.
|
|
|
|
-u: print each unique word only once
|
|
-c: print a count of words and exit
|
|
-uc: print a count for each unique word
|
|
-a: strip non-alphanumeric characters for current locale
|
|
-i: coerce all to lowercase, ignore case when considering duplicates
|
|
-h: print this help and exit
|
|
|
|
-s n, -b n: (s)hortest and (b)iggest words to pass through
|
|
-d pattern: word delimiter (a Perl regexp)
|
|
|
|
If no file is given, standard input will be read instead.
|
|
|
|
Examples:
|
|
|
|
# list all unique words, ignoring case, in foo:
|
|
words -ui ./foo
|
|
|
|
# find ten most used words longer than 6 letters in foo:
|
|
words -uci -s6 foo | sort -nr | head -10
|
|
|
|
HELP
|
|
exit;
|
|
}
|
|
|
|
if ($opt_b && ($opt_b < $opt_s)) {
|
|
say "illogical length constraint:\n"
|
|
. " [-b]iggest of $opt_b can't be less than [-s]hortest of $opt_s";
|
|
exit 1;
|
|
}
|
|
|
|
my $everything;
|
|
{
|
|
local $/ = undef; # unset record separator
|
|
while (<>) {
|
|
# one file per iteration, if we're processing filenames
|
|
$everything .= $_;
|
|
}
|
|
}
|
|
|
|
if (! defined $everything) {
|
|
exit 2;
|
|
}
|
|
|
|
if ($opt_a) {
|
|
# strip non-alphanumeric characters for current locale:
|
|
$everything =~ s/[^[:alnum:]]/ /gx;
|
|
}
|
|
|
|
# chop into individual "words" by our delimiter
|
|
# (defaults to any amount of whitespace):
|
|
my %words;
|
|
my $wordcount = 0;
|
|
foreach my $word (split /$opt_d/, $everything) {
|
|
if ($opt_i) {
|
|
$word = lc($word);
|
|
}
|
|
|
|
# don't allow bigger than -b or shorter than -s letters
|
|
if ($opt_b) { next if length($word) > $opt_b; }
|
|
if ($opt_s) { next if length($word) < $opt_s; }
|
|
|
|
# skip non-unique words
|
|
if ($opt_u) { next if ( $words{$word}++ ) > 0; }
|
|
|
|
say $word unless $opt_c;
|
|
|
|
$wordcount++;
|
|
}
|
|
|
|
# we've been asked to give a count for each unique word
|
|
if ($opt_u && $opt_c) {
|
|
foreach my $word (keys %words) {
|
|
say "$words{$word}\t$word";
|
|
}
|
|
} elsif ($opt_c) {
|
|
# wc(1) stylee:
|
|
say " $wordcount";
|
|
}
|