use regex::{Regex, Replacer};
|
|
use std::io::{self, Read};
|
|
use structopt::StructOpt;
|
|
use failure::ResultExt;
|
|
use exitfailure::ExitFailure;
|
|
|
|
#[derive(StructOpt)]
|
|
#[structopt(name = "bpb-words", about = "Split files into individual words.")]
|
|
struct Cli {
|
|
/// (f)ile to read, or - for stdin
|
|
#[structopt(default_value = "-")]
|
|
path: String,
|
|
|
|
/// (d)elimiter to split on - a regular expression
|
|
#[structopt(short, long, default_value = r"\s+")]
|
|
delimiter: String,
|
|
|
|
/// Strip non-alphanumeric characters
|
|
#[structopt(short, long)]
|
|
alpha_only: bool,
|
|
|
|
/// Coerce all words to lower-case
|
|
#[structopt(short, long)]
|
|
lower_case: bool,
|
|
|
|
/// (s)hortest word to pass through
|
|
#[structopt(short, long)]
|
|
shortest: Option<u64>,
|
|
|
|
/// (b)iggest word to pass through
|
|
#[structopt(short, long)]
|
|
biggest: Option<u64>,
|
|
|
|
}
|
|
|
|
// TODO:
|
|
// - [X] command-line help
|
|
// - [X] file input
|
|
// - [X] support specifying split delimiter pattern
|
|
// - [X] support stripping non-alphanumeric chars
|
|
// - [X] support coercing to lowercase
|
|
// - [X] support max and min length of words to pass through
|
|
|
|
fn main() -> Result<(), ExitFailure> {
|
|
let args = Cli::from_args();
|
|
|
|
let mut content = String::new();
|
|
|
|
if args.path == "-" {
|
|
let stdin = io::stdin();
|
|
let mut handle = stdin.lock();
|
|
handle.read_to_string(&mut content)?;
|
|
} else {
|
|
// https://rust-lang-nursery.github.io/cli-wg/tutorial/errors.html
|
|
content = std::fs::read_to_string(&args.path)
|
|
.with_context(|_| format!("could not read file `{}`", &args.path))?;
|
|
}
|
|
|
|
let delimiter = Regex::new(&args.delimiter)?;
|
|
|
|
// bpb_words::split_words(&delimiter, &content, &mut words)?;
|
|
|
|
let split = delimiter.split(&content);
|
|
|
|
for word in split {
|
|
// Handle stripping non a-z (ish) characters:
|
|
let mut replaced = String::new();
|
|
let word = if args.alpha_only {
|
|
replaced = bpb_words::replace_nonalpha(&word, "");
|
|
replaced.as_str()
|
|
} else {
|
|
word
|
|
};
|
|
|
|
// Handle lowercasing:
|
|
let mut lc = String::new();
|
|
let word = if args.lower_case {
|
|
lc = word.to_lowercase();
|
|
lc.as_str()
|
|
} else {
|
|
word
|
|
};
|
|
|
|
let count = word.chars().count();
|
|
|
|
let pass_max_len = match args.biggest {
|
|
// Some(max_len) => count <= max_len,
|
|
Some(max_len) => count as u64 <= max_len,
|
|
None => true,
|
|
};
|
|
|
|
let pass_min_len = match args.shortest {
|
|
// Some(max_len) => count <= max_len,
|
|
Some(min_len) => count as u64 >= min_len,
|
|
None => true,
|
|
};
|
|
|
|
if (pass_min_len && pass_max_len) {
|
|
println!("{}", word);
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|