A Rust clone of a Perl word-splitting program.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 

104 lines
2.8 KiB

use regex::{Regex, Replacer};
use std::io::{self, Read};
use structopt::StructOpt;
use failure::ResultExt;
use exitfailure::ExitFailure;
// Command-line arguments for the word splitter, parsed by `Cli::from_args()`
// in `main` via the `StructOpt` derive.
//
// NOTE: structopt builds the `--help` descriptions from the `///` doc comments
// on each field, so their wording is user-facing output; edit them with that
// in mind.
#[derive(StructOpt)]
#[structopt(name = "bpb-words", about = "Split files into individual words.")]
struct Cli {
// Positional argument (it has no `short`/`long` attribute). `main` reads
// standard input when the value is "-", which is also the default.
// NOTE(review): the "(f)ile" hint in the help text suggests a `-f` flag, but
// none is declared here - confirm whether that is intended.
/// (f)ile to read, or - for stdin
#[structopt(default_value = "-")]
path: String,
// Pattern string that `main` compiles with `Regex::new`; the default splits on
// runs of whitespace.
/// (d)elimiter to split on - a regular expression
#[structopt(short, long, default_value = r"\s+")]
delimiter: String,
// When set, `main` passes each word through `bpb_words::replace_nonalpha`
// with an empty replacement.
/// Strip non-alphanumeric characters
#[structopt(short, long)]
alpha_only: bool,
// When set, `main` calls `to_lowercase()` on each word before filtering.
/// Coerce all words to lower-case
#[structopt(short, long)]
lower_case: bool,
// Optional inclusive lower bound on the word's `char` count; no lower limit
// is applied when absent.
/// (s)hortest word to pass through
#[structopt(short, long)]
shortest: Option<u64>,
// Optional inclusive upper bound on the word's `char` count; no upper limit
// is applied when absent.
/// (b)iggest word to pass through
#[structopt(short, long)]
biggest: Option<u64>,
}
// TODO:
// - [X] command-line help
// - [X] file input
// - [X] support specifying split delimiter pattern
// - [X] support stripping non-alphanumeric chars
// - [X] support coercing to lowercase
// - [X] support max and min length of words to pass through
fn main() -> Result<(), ExitFailure> {
let args = Cli::from_args();
let mut content = String::new();
if args.path == "-" {
let stdin = io::stdin();
let mut handle = stdin.lock();
handle.read_to_string(&mut content)?;
} else {
// https://rust-lang-nursery.github.io/cli-wg/tutorial/errors.html
content = std::fs::read_to_string(&args.path)
.with_context(|_| format!("could not read file `{}`", &args.path))?;
}
let delimiter = Regex::new(&args.delimiter)?;
// bpb_words::split_words(&delimiter, &content, &mut words)?;
let split = delimiter.split(&content);
for word in split {
// Handle stripping non a-z (ish) characters:
let mut replaced = String::new();
let word = if args.alpha_only {
replaced = bpb_words::replace_nonalpha(&word, "");
replaced.as_str()
} else {
word
};
// Handle lowercasing:
let mut lc = String::new();
let word = if args.lower_case {
lc = word.to_lowercase();
lc.as_str()
} else {
word
};
let count = word.chars().count();
let pass_max_len = match args.biggest {
// Some(max_len) => count <= max_len,
Some(max_len) => count as u64 <= max_len,
None => true,
};
let pass_min_len = match args.shortest {
// Some(max_len) => count <= max_len,
Some(min_len) => count as u64 >= min_len,
None => true,
};
if (pass_min_len && pass_max_len) {
println!("{}", word);
}
}
Ok(())
}