|
|
- use std::io::{self, Read, Write};
-
- use exitfailure::ExitFailure;
- use failure::ResultExt;
- use regex::{Regex, Replacer};
- use structopt::StructOpt;
-
- #[derive(StructOpt)]
- #[structopt(name = "bpb-words", about = "Split files into individual words.")]
- struct Cli { // Command-line options consumed by `main`; NOTE(review): structopt is expected to surface the `///` lines below as per-option help text, so edit them with care.
- /// (f)ile to read, or - for stdin
- #[structopt(default_value = "-")]
- path: String, // `main` reads stdin when this equals "-", otherwise reads the named file
-
- /// (d)elimiter to split on - a regular expression
- #[structopt(short, long, default_value = r"\s+")]
- delimiter: String, // compiled with `Regex::new` in `main`; an invalid pattern makes `main` return an error
-
- /// Strip non-alphanumeric characters
- #[structopt(short, long)]
- alpha_only: bool, // when set, `main` passes each word through `bpb_words::replace_nonalpha(.., "")`
-
- /// Coerce all words to lower-case
- #[structopt(short, long)]
- lower_case: bool, // when set, `main` applies `to_lowercase` to each word (after any stripping)
-
- /// (s)hortest word to pass through
- #[structopt(short, long)]
- shortest: Option<u64>, // inclusive lower bound on the word's `chars().count()`; `None` disables the check
-
- /// (b)iggest word to pass through
- #[structopt(short, long)]
- biggest: Option<u64>, // inclusive upper bound on the word's `chars().count()`; `None` disables the check
-
- }
-
- // TODO:
- // - [X] command-line help
- // - [X] file input
- // - [X] support specifying split delimiter pattern
- // - [X] support stripping non-alphanumeric chars
- // - [X] support coercing to lowercase
- // - [X] support max and min length of words to pass through
-
- /// Entry point: reads the input, splits it on the delimiter pattern, applies the
- /// optional per-word transformations and length filters, and prints the
- /// surviving words one per line.
- ///
- /// # Errors
- ///
- /// Returns an error if stdin or the input file cannot be read, if the delimiter
- /// is not a valid regular expression, or if writing to stdout fails.
- fn main() -> Result<(), ExitFailure> {
-     let args = Cli::from_args();
-
-     let content = if args.path == "-" {
-         let mut buf = String::new();
-         let stdin = io::stdin();
-         stdin.lock().read_to_string(&mut buf)?;
-         buf
-     } else {
-         // https://rust-lang-nursery.github.io/cli-wg/tutorial/errors.html
-         std::fs::read_to_string(&args.path)
-             .with_context(|_| format!("could not read file `{}`", &args.path))?
-     };
-
-     let delimiter = Regex::new(&args.delimiter)?;
-
-     // Lock stdout once instead of once per word, and propagate write errors
-     // (for example a closed pipe when piping into `head`) instead of panicking
-     // the way `println!` does.
-     let stdout = io::stdout();
-     let mut out = stdout.lock();
-
-     for word in delimiter.split(&content) {
-         // Strip non-alphanumeric characters when requested. The owned result
-         // is declared outside the `if` so the borrowed `&str` can outlive it.
-         let stripped;
-         let word = if args.alpha_only {
-             stripped = bpb_words::replace_nonalpha(&word, "");
-             stripped.as_str()
-         } else {
-             word
-         };
-
-         // Coerce to lower-case when requested.
-         let lowered;
-         let word = if args.lower_case {
-             lowered = word.to_lowercase();
-             lowered.as_str()
-         } else {
-             word
-         };
-
-         // Splitting yields empty tokens for leading/trailing delimiters (such
-         // as a file's final newline), and stripping can empty a token that was
-         // pure punctuation. Neither is a word, so do not print a blank line.
-         if word.is_empty() {
-             continue;
-         }
-
-         // Length is measured in Unicode scalar values, not bytes. Both bounds
-         // are inclusive, and an absent bound always passes.
-         let count = word.chars().count() as u64;
-         let long_enough = args.shortest.map_or(true, |min_len| count >= min_len);
-         let short_enough = args.biggest.map_or(true, |max_len| count <= max_len);
-
-         if long_enough && short_enough {
-             writeln!(out, "{}", word)?;
-         }
-     }
-
-     Ok(())
- }
|