I wanted something I could easily re-use for my text preprocessing, so I created this function. You can feed an entire corpus, a single document, or even one sentence into the “process_text” function.
Function Parameters:
- Stem: if “stem” is set to True, the tokens will be run through a Porter stemmer.
- Lemma: if “lemma” is set to True, the tokens will be run through the WordNetLemmatizer. It is advisable to use either stem or lemma, not both (see the short sketch after this list).
- Rare: if “rare” is set to True, the 50 least frequently used words will be dropped from the tokens.
- Common: if “common” is set to True, the 8 most frequent words will be printed to the screen. Note that this parameter does not change the tokens.
- Freq Plot: if “freqplot” is set to True, a frequency plot of the tokens will be drawn. Note that if the input “text” is a collection of documents you’ll get a frequency plot for each document; if you submit a variable that contains the whole corpus you’ll get one frequency plot for the entire corpus.
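To make the stem-versus-lemma advice concrete, here is a minimal sketch (separate from “process_text”, using made-up sample words) showing how the two treat the same tokens differently:

from nltk.stem import PorterStemmer, WordNetLemmatizer

words = ['studies', 'running', 'children']           # hypothetical sample tokens
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

print([stemmer.stem(w) for w in words])              # e.g. ['studi', 'run', 'children']
print([lemmatizer.lemmatize(w) for w in words])      # e.g. ['study', 'running', 'child']

Stemming chops suffixes (sometimes producing non-words like “studi”), while lemmatization maps words to dictionary forms, so running both back to back rarely adds anything useful.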
import collections
import operator
import re
import string

import matplotlib.pyplot as plt
import nltk
import pandas as pd
from nltk import word_tokenize, regexp_tokenize, wordpunct_tokenize, blankline_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from pprint import pprint
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
# (some of these imports support the wider notebook rather than this one function)

# Start from NLTK's English stop word list and add some domain-specific words
# ('wa' appears because the lemmatizer reduces 'was' to 'wa').
stopwords = stopwords.words('english')
newStopWords = ['children', 'speech', 'language', 'also', 'wa', 'child', 'age', 'development', 'disorder']
stopwords.extend(newStopWords)


def process_text(text, stem=True, lemma=False, stop=True, rare=False, common=True, freqplot=False):
    """Tokenize text, optionally stem or lemmatize, and remove punctuation, stop words and rare words."""
    # Strip punctuation, tokenize and lowercase
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [x.lower() for x in tokens]

    if stem:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(t) for t in tokens]

    if lemma:
        wlem = WordNetLemmatizer()
        tokens = [wlem.lemmatize(t) for t in tokens]

    if stop:
        tokens = [tok for tok in tokens if tok not in stopwords]

    if rare:
        # Drop the 50 least frequent tokens
        freq_dist = nltk.FreqDist(tokens)
        rarewords = [word for word, count in freq_dist.most_common()[-50:]]
        tokens = [tok for tok in tokens if tok not in rarewords]

    if common:
        # Report the 8 most frequent tokens (does not change the tokens)
        freq_dist = nltk.FreqDist(tokens)
        commonwords = freq_dist.most_common(8)
        print("Common words: ", commonwords)

    if freqplot:
        # Frequency plot of the 50 most common tokens, plus the top 40 printed as a list
        freq_dist_nltk = nltk.FreqDist(tokens)
        plt.figure(figsize=(10, 5))
        freq_dist_nltk.plot(50, cumulative=False)
        sortedfreq = sorted(freq_dist_nltk.items(), key=operator.itemgetter(1), reverse=True)
        print(sortedfreq[:40])

    return tokens
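One setup note: word_tokenize, the stop word list and the WordNetLemmatizer all depend on NLTK data packages, so if you haven’t fetched them yet you’ll need a one-time download step before calling the function (standard NLTK setup, not part of “process_text”):

import nltk

nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('stopwords')  # English stop word list
nltk.download('wordnet')    # lexical data used by WordNetLemmatizer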
Call the function like this:
for dsiT in dsiText:
    tokens = process_text(dsiT, stem=False, lemma=True, stop=True, rare=True, common=True, freqplot=False)
    print(tokens[:10])
Or like this, to analyse the entire corpus in one go:
# Corpus Analysis
alldsi = []
for dsiT in dsiText:
    alldsi.append(dsiT)
corpus = ' '.join(alldsi)
ctokens = process_text(corpus, stem=False, lemma=True, stop=True, rare=True, common=True, freqplot=True)
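The KMeans and TfidfVectorizer imports at the top aren’t actually used inside “process_text”; they belong to the rest of the pipeline. As a rough, illustrative sketch (the helper name “tfidf_tokenizer”, the number of clusters and the parameter choices are all just assumptions), the cleaned tokens could feed into TF-IDF and clustering like this:

def tfidf_tokenizer(doc):
    # Hypothetical helper: same cleaning as above, minus the printing/plotting side effects
    return process_text(doc, stem=False, lemma=True, stop=True, rare=False, common=False, freqplot=False)

vectorizer = TfidfVectorizer(tokenizer=tfidf_tokenizer, lowercase=False)
tfidf = vectorizer.fit_transform(dsiText)

km = KMeans(n_clusters=3, random_state=42)  # 3 clusters is only a placeholder choice
km.fit(tfidf)
print(km.labels_[:10])                      # cluster assignment for the first 10 documents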
That’s all for now…