# extract_words: read text on stdin, write one cleaned-up word per line.
#
# Steps:
#   1. turn punctuation (and the \r of \r\n line endings) into spaces
#   2. put every space-separated token on its own line
#   3. keep only tokens made of Unicode letters, numbers, hyphens, apostrophes
#   4. strip a trailing 's, then one leading and one trailing apostrophe
#   5. drop whatever has no letter left (pure numbers, "--", empty lines)
#
# NOTE: \p{L} / \p{N} need GNU grep with PCRE support (-P); run under a
# UTF-8 locale so that non-ASCII letters are recognised as letters.
extract_words() {
  # tr works on single bytes, so multi-byte typographic punctuation must not
  # be listed in its set: it would be split into bytes and every other UTF-8
  # character sharing one of those bytes (e.g. "À", "Ü", "М") would be
  # corrupted. Replace those characters as whole sequences with sed instead.
  # Dashes and the ellipsis are included so "foo—bar" yields both words.
  sed -e 's/“/ /g' -e 's/”/ /g' -e 's/‘/ /g' -e 's/’/ /g' \
      -e 's/–/ /g' -e 's/—/ /g' -e 's/…/ /g' |
    tr '!()[]{};:",<.>?*/\r' ' ' |
    tr ' ' '\n' |
    grep -a -P "^[\p{L}\p{N}\-']+\$" |
    sed -e "s/'s\$//" -e "s/^'//" -e "s/'\$//" |
    # Filter AFTER stripping: tokens such as "'s" or "1's" only become empty
    # or purely numeric once the suffix/apostrophes are gone.
    grep -a -P -v "^[\p{N}\-']*\$"
}

extract_words < book.txt > words.txt

# rank_words: read one word per line on stdin and write each distinct word
# once, ordered by how often it occurred (most frequent first).
rank_words() {
  sort | uniq -c |
    sort -nr |
    # Remove the count column by pattern rather than by fixed width: the
    # padding written by `uniq -c` varies between implementations and grows
    # with large counts, so a fixed `cut -c9-` can cut into the word itself.
    sed 's/^ *[0-9]* //'
}

rank_words < words.txt > words_desc.txt
  • replace punctuation with spaces; strip the \r from `\r\n` line endings
  • one word per line
  • keep only words composed of unicode letters, numbers, hyphen and apostrophe
  • remove pure numbers
  • remove a trailing 's
  • remove starting and ending apostrophe
  • output words.txt

  • sort and count unique words

  • sort by frequency in descending order

  • trim the frequency column