#!/usr/bin/env bash
#
# Build a word list and a frequency-ranked word list from a plain-text book.
#
# Usage: bash <this-script> [INPUT]      (INPUT defaults to book.txt)
#
# Outputs (written to the current directory):
#   words.txt       every accepted word, one per line, in reading order
#   words_desc.txt  unique words, most frequent first, counts removed
#
# Requires a grep with -P (PCRE) support, e.g. GNU grep. Run it under a
# UTF-8 locale so that \p{L} / \p{N} can classify multi-byte characters.

#######################################
# Split text into words and normalise them.
# Arguments: none
# Inputs:    raw text on stdin
# Outputs:   one word per line on stdout
#######################################
extract_words() {
  # Curly quotes are multi-byte in UTF-8. Byte-oriented tr implementations
  # (such as GNU tr) would translate each of their bytes individually and
  # thereby damage other characters that share those bytes, so replace
  # them as exact byte sequences with sed under LC_ALL=C instead.
  LC_ALL=C sed -e 's/“/ /g' -e 's/”/ /g' -e 's/‘/ /g' -e 's/’/ /g' \
    | tr '!()[]{};:",<.>?*/\r ' '[\n*]' \
    | grep -a -P "^[\p{L}\p{N}\-']+\$" \
    | grep -a -P -v "^[\p{N}\-']+\$" \
    | sed -e "s/'s\$//" -e "s/^'//" -e "s/'\$//" -e '/^$/d'
  # tr: ASCII punctuation, CR and spaces all become newlines (one token per
  #     line); '[\n*]' repeats the newline to the length of the first set.
  # grep #1: keep tokens made only of letters, numbers, '-' and "'".
  # grep #2: drop tokens made only of numbers, '-' and "'".
  # sed: strip a trailing 's, then one leading and one trailing apostrophe;
  #      finally delete tokens that became empty (e.g. a lone 's).
}

#######################################
# Rank words by how often they occur.
# Arguments: none
# Inputs:    one word per line on stdin
# Outputs:   unique words on stdout, highest count first, without counts
#######################################
rank_words() {
  # The count column of `uniq -c` has no guaranteed width, so strip it by
  # pattern rather than by a fixed character offset.
  sort \
    | uniq -c \
    | sort -nr \
    | sed 's/^ *[0-9][0-9]* //'
}

#######################################
# Entry point.
# Arguments: $1 - input text file (optional, default: book.txt)
# Outputs:   writes words.txt and words_desc.txt
# Returns:   1 if the input is unreadable (outputs are left untouched),
#            otherwise the status of the ranking step
#######################################
main() {
  local input=${1:-book.txt}

  if [[ ! -r "$input" ]]; then
    printf 'error: cannot read input file: %s\n' "$input" >&2
    return 1
  fi

  extract_words < "$input" > words.txt
  rank_words < words.txt > words_desc.txt
}

main "$@"
-
replace punctuation with spaces; remove
\r
from `\r\n' -
one word per line
-
keep only words composed of Unicode letters, numbers, hyphens and apostrophes
-
remove pure numbers
-
remove a trailing 's (possessive suffix)
-
remove starting and ending apostrophe
-
output words.txt
-
sort and count unique words
-
sort by frequency in descending order
-
trim the frequency column