text cleaning

I was asked to clean up a really messy text file. It’s a Word file with names, addresses, phone numbers, schools, spouses’ dates, and random notes in no consistent order, with a lot of missing information and duplicates!

Initialization and read in code

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
from commonregex import CommonRegex
parser = CommonRegex()
# Read the raw regional membership dump and split it into (region, entry) rows.
# BUG FIX: was `names=[[]]` — the empty inner list became a spurious
# all-NaN first row in the DataFrame.
names = []
with open('../input/regionalliststxt.txt') as fp:
    # Only the first 12 000 characters of the file are needed.
    text = fp.read(12000)
print('** READ in TEXT: **\n')
for region in text.split('!!Region'):
    # The region name sits between '!!' markers inside the first 50 characters.
    # Skip any leading chunk that has no '!!' marker (e.g. text before the
    # first '!!Region'), which would otherwise raise IndexError.
    if '!!' not in region[0:50]:
        continue
    regname = region[0:50].split('!!')[1]
    print(regname)
    pplinReg = region[50:]
    entries = pplinReg.split('%')  # '%' separates individual person entries
    print(len(entries), type(entries), entries[3])
    for entry in entries:
        names.append([regname, entry])
df = pd.DataFrame(names, columns=['Region', 'PersonEntry'])

Now let's clean the entries:


# Clean each entry and pull structured fields out with CommonRegex.
# Collapse runs of newlines inside each entry.  `regex=True` is required on
# pandas >= 1.4, where the default no longer treats the pattern as a regex.
df['PersonEntry'] = df['PersonEntry'].str.replace(r'\n+', '', regex=True)
df['phone'] = [parser.phones(x) for x in df.PersonEntry]
df['email'] = [parser.emails(x) for x in df.PersonEntry]
# BUG FIX: `btc_addresses` matches *Bitcoin* addresses; `street_addresses`
# is the CommonRegex method for postal street addresses.
df['address'] = [parser.street_addresses(x) for x in df.PersonEntry]
# MEMBER SINCE
# BUG FIX: the old pattern "since+'\d+" required a literal apostrophe and a
# repeated trailing 'e' ("sincee..."), so it could never match; match
# "since <year>" with optional whitespace instead.
df['since'] = df['PersonEntry'].str.findall(r"Member of OVERSEAS BRATS since\s*\d+")
# Service-branch flags: a non-empty list marks the entry as that branch.
df['AFB'] = df['PersonEntry'].str.findall(r"Air Force Brat")
df['Army'] = df['PersonEntry'].str.findall(r"Army Brat")
df['Navy'] = df['PersonEntry'].str.findall(r"Navy Brat")
df.head(10)

Leave a comment