# Build a word list and a frequency-ranked vocabulary from book.txt.
#
#   book.txt        input text (LF or CRLF line endings)
#   words.txt       every accepted word, one per line, in reading order
#   words_desc.txt  the distinct words, most frequent first
#
# Needs a grep with -P (PCRE) support. NOTE: \p{L}/\p{N} are expected to
# match non-ASCII letters/digits only when run under a UTF-8 locale.

# extract_words: filter; raw text on stdin -> one cleaned word per line.
extract_words() {
  # tr translates single bytes, so it is given ASCII punctuation and \r only.
  # The curly quotes are multibyte in UTF-8: listing them in tr's set would
  # blank their individual bytes wherever they occur and corrupt unrelated
  # characters that share those bytes. sed replaces them as whole sequences.
  sed -e 's/“/ /g' -e 's/”/ /g' -e 's/‘/ /g' -e 's/’/ /g' |
    tr '!()[]{};:",<.>?*/\r' ' ' |
    tr ' ' '\n' |
    # Keep tokens made only of letters, digits, hyphens and apostrophes ...
    grep -a -P "^[\p{L}\p{N}\-']+\$" |
    # ... but drop tokens with no letter at all (pure numbers, "3-4", "'").
    grep -a -P -v "^[\p{N}\-']+\$" |
    # Strip a trailing 's, then one leading and one trailing apostrophe.
    # A bare "'s" token becomes empty here, so empty lines are deleted.
    sed -e "s/'s\$//" -e "s/^'//" -e "s/'\$//" -e '/^$/d'
}

# rank_words: filter; one word per line on stdin -> distinct words on
# stdout, ordered by descending number of occurrences.
rank_words() {
  sort | uniq -c | sort -nr |
    # Remove the count column by pattern instead of a fixed character
    # offset: the padding width of `uniq -c` differs between
    # implementations and grows with very large counts.
    sed 's/^ *[0-9][0-9]* //'
}

# If book.txt cannot be opened, the redirection fails before anything runs,
# so existing output files are left untouched and the status is non-zero.
extract_words < book.txt > words.txt &&
  rank_words < words.txt > words_desc.txt
- replace punctuation with spaces; strip the `\r` from `\r\n` line endings
- one word per line
- keep only words composed of unicode letters, numbers, hyphen and apostrophe
- remove pure numbers
- remove a trailing 's (possessive)
- remove starting and ending apostrophe
output words.txt
sort and count unique words
sort by frequency in descending order
trim the frequency column