A Rust clone of a Perl word-splitting program.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

104 lines
2.8 KiB

  1. use regex::{Regex, Replacer};
  2. use std::io::{self, Read};
  3. use structopt::StructOpt;
  4. use failure::ResultExt;
  5. use exitfailure::ExitFailure;
  6. #[derive(StructOpt)]
  7. #[structopt(name = "bpb-words", about = "Split files into individual words.")]
  8. struct Cli {
  9. /// (f)ile to read, or - for stdin
  10. #[structopt(default_value = "-")]
  11. path: String,
  12. /// (d)elimiter to split on - a regular expression
  13. #[structopt(short, long, default_value = r"\s+")]
  14. delimiter: String,
  15. /// Strip non-alphanumeric characters
  16. #[structopt(short, long)]
  17. alpha_only: bool,
  18. /// Coerce all words to lower-case
  19. #[structopt(short, long)]
  20. lower_case: bool,
  21. /// (s)hortest word to pass through
  22. #[structopt(short, long)]
  23. shortest: Option<u64>,
  24. /// (b)iggest word to pass through
  25. #[structopt(short, long)]
  26. biggest: Option<u64>,
  27. }
// TODO:
// - [X] command-line help
// - [X] file input
// - [X] support specifying split delimiter pattern
// - [X] support stripping non-alphanumeric chars
// - [X] support coercing to lowercase
// - [X] support max and min length of words to pass through
  35. fn main() -> Result<(), ExitFailure> {
  36. let args = Cli::from_args();
  37. let mut content = String::new();
  38. if args.path == "-" {
  39. let stdin = io::stdin();
  40. let mut handle = stdin.lock();
  41. handle.read_to_string(&mut content)?;
  42. } else {
  43. // https://rust-lang-nursery.github.io/cli-wg/tutorial/errors.html
  44. content = std::fs::read_to_string(&args.path)
  45. .with_context(|_| format!("could not read file `{}`", &args.path))?;
  46. }
  47. let delimiter = Regex::new(&args.delimiter)?;
  48. // bpb_words::split_words(&delimiter, &content, &mut words)?;
  49. let split = delimiter.split(&content);
  50. for word in split {
  51. // Handle stripping non a-z (ish) characters:
  52. let mut replaced = String::new();
  53. let word = if args.alpha_only {
  54. replaced = bpb_words::replace_nonalpha(&word, "");
  55. replaced.as_str()
  56. } else {
  57. word
  58. };
  59. // Handle lowercasing:
  60. let mut lc = String::new();
  61. let word = if args.lower_case {
  62. lc = word.to_lowercase();
  63. lc.as_str()
  64. } else {
  65. word
  66. };
  67. let count = word.chars().count();
  68. let pass_max_len = match args.biggest {
  69. // Some(max_len) => count <= max_len,
  70. Some(max_len) => count as u64 <= max_len,
  71. None => true,
  72. };
  73. let pass_min_len = match args.shortest {
  74. // Some(max_len) => count <= max_len,
  75. Some(min_len) => count as u64 >= min_len,
  76. None => true,
  77. };
  78. if (pass_min_len && pass_max_len) {
  79. println!("{}", word);
  80. }
  81. }
  82. Ok(())
  83. }