Dotfiles, utilities, and other apparatus.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

97 lines
2.3 KiB

  1. #!/usr/bin/env perl
  2. # For a sort of rebuttal of this program, see:
  3. # http://www.leancrew.com/all-this/2011/12/more-shell-less-egg/
  4. use strict;
  5. use warnings;
  6. use 5.10.0;
  7. use Getopt::Std;
  8. our ($opt_h, $opt_i, $opt_u, $opt_a, $opt_c, $opt_b, $opt_s, $opt_d) = (0, 0, 0, 0, 0, 0, 0, '\s+');
  9. getopts('huacib:s:d:');
  10. # documentate
  11. if ($opt_h) {
  12. print <<'HELP';
  13. Usage: words [-ucaih] [-s n] [-b n] [-d pattern] [file]
  14. Split input into individual words, crudely understood.
  15. -u: print each unique word only once
  16. -c: print a count of words and exit
  17. -uc: print a count for each unique word
  18. -a: strip non-alphanumeric characters for current locale
  19. -i: coerce all to lowercase, ignore case when considering duplicates
  20. -h: print this help and exit
  21. -s n, -b n: (s)hortest and (b)iggest words to pass through
  22. -d pattern: word delimiter (a Perl regexp)
  23. If no file is given, standard input will be read instead.
  24. Examples:
  25. # list all unique words, ignoring case, in foo:
  26. words -ui ./foo
  27. # find ten most used words longer than 6 letters in foo:
  28. words -uci -s6 foo | sort -nr | head -10
  29. HELP
  30. exit;
  31. }
  32. if ($opt_b && ($opt_b < $opt_s)) {
  33. say "illogical length constraint:\n"
  34. . " [-b]iggest of $opt_b can't be less than [-s]hortest of $opt_s";
  35. exit 1;
  36. }
  37. my $everything;
  38. {
  39. local $/ = undef; # unset record separator
  40. while (<>) {
  41. # one file per iteration, if we're processing filenames
  42. $everything .= $_;
  43. }
  44. }
  45. if (! defined $everything) {
  46. exit 2;
  47. }
  48. if ($opt_a) {
  49. # strip non-alphanumeric characters for current locale:
  50. $everything =~ s/[^[:alnum:]]/ /gx;
  51. }
  52. # chop into individual "words" by our delimiter
  53. # (defaults to any amount of whitespace):
  54. my %words;
  55. my $wordcount = 0;
  56. foreach my $word (split /$opt_d/, $everything) {
  57. if ($opt_i) {
  58. $word = lc($word);
  59. }
  60. # don't allow bigger than -b or shorter than -s letters
  61. if ($opt_b) { next if length($word) > $opt_b; }
  62. if ($opt_s) { next if length($word) < $opt_s; }
  63. # skip non-unique words
  64. if ($opt_u) { next if ( $words{$word}++ ) > 0; }
  65. say $word unless $opt_c;
  66. $wordcount++;
  67. }
  68. # we've been asked to give a count for each unique word
  69. if ($opt_u && $opt_c) {
  70. foreach my $word (keys %words) {
  71. say "$words{$word}\t$word";
  72. }
  73. } elsif ($opt_c) {
  74. # wc(1) stylee:
  75. say " $wordcount";
  76. }