Text Processing – putting it all together

I wanted something I could easily re-use for my text preprocessing, so I created this function.

You can feed an entire corpus, a single document, or even a single sentence into the “process_text” function.

Function Parameters:

  • Stem: if “stem” is set to True, the tokens are run through a Porter stemmer.
  • Lemma: if “lemma” is set to True, the tokens are run through the WordNet lemmatizer. Note that it is advisable to use either stem or lemma, not both (see the short comparison after this list).
  • Rare: if “rare” is set to True, the 50 least frequently used words are dropped.
  • Common: if “common” is set to True, the 8 most frequent words are printed to the screen. Note that this parameter does not change the tokens.
  • Freq Plot: if “freqplot” is set to True, a frequency plot of the tokens is printed. If the input “text” is a collection of documents you’ll get a frequency plot for each document; if you submit a variable that contains the whole corpus you’ll get one frequency plot for the entire corpus.
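
To see why you’d pick stemming or lemmatization (but not both), here is a small standalone sketch, separate from the function below, that runs a few sample words through the Porter stemmer and the WordNet lemmatizer; the word list is just an illustration:

from nltk.stem import PorterStemmer, WordNetLemmatizer

stemmer = PorterStemmer()
wlem = WordNetLemmatizer()

# Print each word next to its stemmed and lemmatized form to compare the two approaches
for word in ['studies', 'running', 'better']:
    print(word, '->', stemmer.stem(word), '(stem)', wlem.lemmatize(word), '(lemma)')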
import string
import collections
import operator
import re

import nltk
import pandas as pd
import matplotlib.pyplot as plt

from nltk import word_tokenize, regexp_tokenize, wordpunct_tokenize, blankline_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint

# NLTK's English stop word list plus a few domain-specific additions
stopwords = nltk.corpus.stopwords.words('english')
newStopWords = ['children','speech','language','also','wa','child','age','development','disorder']
stopwords.extend(newStopWords)
 
def process_text(text, stem=True, lemma=False, stop=True, rare=False, common=True, freqplot=False):
    """Tokenize text, removing punctuation, and optionally stem/lemmatize and filter the tokens."""
    text = re.sub(r'[^\w\s]', '', text)   # strip punctuation
    tokens = word_tokenize(text)
    tokens = [x.lower() for x in tokens]  # lower-case everything

    if stem:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(t) for t in tokens]
        #DSI.df['stem']=tokens

    if lemma:
        wlem = WordNetLemmatizer()
        tokens = [wlem.lemmatize(t) for t in tokens]
        #DSI.df['lemma']=tokens

    if stop:
        tokens = [tok for tok in tokens if tok not in stopwords]
        #DSI.df['stops']=tokens

    if rare:
        # drop the 50 least frequently used tokens
        freq_dist = nltk.FreqDist(tokens)
        rarewords = [word for word, count in freq_dist.most_common()[-50:]]
        tokens = [tok for tok in tokens if tok not in rarewords]

    if common:
        # report the 8 most frequent tokens (does not change the tokens)
        freq_dist = nltk.FreqDist(tokens)
        commonwords = freq_dist.most_common(8)
        print("Common words: ", commonwords)

    if freqplot:
        # frequency plot of the 50 most common tokens, plus the top 40 counts as text
        freq_dist_nltk = nltk.FreqDist(tokens)
        plt.figure(figsize=(10, 5))
        freq_dist_nltk.plot(50, cumulative=False)
        sortedfreq = sorted(freq_dist_nltk.items(), key=operator.itemgetter(1), reverse=True)
        print(sortedfreq[:40])

    return tokens
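
One caveat before calling it: word_tokenize, the stop word list and the WordNet lemmatizer all rely on NLTK data packages, so if you haven’t fetched them yet a one-time download along these lines is needed:

import nltk

# One-time downloads: tokenizer models, stop word list, and WordNet data for the lemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')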

Call the function like this:

for dsiT in dsiText:
    tokens = process_text(dsiT, stem=False, lemma=True, stop=True, rare=True, common=True, freqplot=False)
    print(tokens[:10])

Or like this:

# Corpus Analysis
alldsi = []
for dsiT in dsiText:
    alldsi.append(dsiT)
corpus = ' '.join(alldsi)
ctokens = process_text(corpus, stem=False, lemma=True, stop=True, rare=True, common=True, freqplot=True)
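
The TfidfVectorizer and KMeans imports at the top point at where this usually goes next. As a rough sketch only (it assumes the dsiText list and process_text from above, and the 3 clusters are an arbitrary choice), the cleaned tokens can be joined back into strings, vectorized and clustered:

# Rough sketch: re-join the cleaned tokens per document, vectorize with TF-IDF, then cluster
cleaned_docs = [' '.join(process_text(d, stem=False, lemma=True, stop=True,
                                      rare=False, common=False, freqplot=False))
                for d in dsiText]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(cleaned_docs)

km = KMeans(n_clusters=3, random_state=42)
km.fit(X)
print(km.labels_)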

That’s all for now…