At the moment, I use that but I'm sure it's not efficient at all and I would like to improve my code :
df1['comms_clean'] = df1['comms'].apply(lambda x : clean_text(x))
df2['comms_clean'] = df2['comms'].apply(lambda x : clean_text(x))
df3['comms_clean'] = df3['comms'].apply(lambda x : clean_text(x))
df4['comms_clean'] = df4['comms'].apply(lambda x : clean_text(x))
df5['comms_clean'] = df5['comms'].apply(lambda x : clean_text(x))
df6['comms_clean'] = df6['comms'].apply(lambda x : clean_text(x))
df7['comms_clean'] = df7['comms'].apply(lambda x : clean_text(x))
df8['comms_clean'] = df8['comms'].apply(lambda x : clean_text(x))
df9['comms_clean'] = df9['comms'].apply(lambda x : clean_text(x))
df10['comms_clean'] = df10['comms'].apply(lambda x : clean_text(x))
df11['comms_clean'] = df11['comms'].apply(lambda x : clean_text(x))
df12['comms_clean'] = df12['comms'].apply(lambda x : clean_text(x))
df13['comms_clean'] = df13['comms'].apply(lambda x : clean_text(x))
df14['comms_clean'] = df14['comms'].apply(lambda x : clean_text(x))
df15['comms_clean'] = df15['comms'].apply(lambda x : clean_text(x))
For now I tried this but I cannot find out how to use the i
in my loop
:
for i in range(1,15):
df{i}['comms_clean'] = df{i}['comms'].apply(lambda x : clean_text(x))
Here's my function clean_text()
:
def clean_text(text):
text = text.lower() #make text lowercase and fill na
text = re.sub('[.*?]', '', text)
text = re.sub('\n', '',str(text))
text = re.sub("[[User.*",'',str(text))
text = re.sub("d{1,3}.d{1,3}.d{1,3}.d{1,3}",'',str(text))
text = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text) #remove hyperlinks
text = re.sub(r':(.*?):', '', text) #remove emoticones
text = re.sub(r'[w.-]+@[w.-]+', '', str(text)) #remove email
text = re.sub(r'(?<=@)w+', '', text) #remove @
text = re.sub(r'[0-9]+', '', text) #remove numbers
text = re.sub("[^A-Za-z0-9 ]", '', text) #remove non alphanumeric like ['@', '#', '.', '(', ')']
text = re.sub(r'[!"$%&'()*+,-./:;=#@?[\]^_`{|}~]*', '', text) #remove punctuations from sentences
text = re.sub('<.*?>+', '', str(text))
text = re.sub('[%s]' % re.escape(string.punctuation), '', str(text))
text = re.sub('w*dw*', '', str(text))
text = tokenizer.tokenize(text)
text = [word for word in text if not word in stop_words]
text = [lemmatizer.lemmatize(word) for word in text]
#text = [stemmer.stem(word) for word in text]
final_text = ' '.join( [w for w in text if len(w)>1] ) #remove word with one letter
return final_text
ANd here's the code used to obtain my dataframe (that's the script for one, for the other I just change the html sources) :
import re
import json
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime
import time
import random
root_url = 'https://www.tripadvisor.ca/Hotel_Review-g186338-d215539-Reviews-or'
urls = [ '{root}{i}-OYO_Apollo_Hotel_Bayswater-London_England.html#REVIEWS'.format(root=root_url, i=i) for i in range(5,440,5) ]
comms = []
notes = []
#datestostay = []
dates = []
for url in urls:
results = requests.get(url)
time.sleep(20)
soup = BeautifulSoup(results.text, "html.parser")
commentary = soup.find_all('div', class_='_2wrUUKlw _3hFEdNs8')
for container in commentary:
comm = container.find('q', class_ = 'IRsGHoPm').text.strip()
comms.append(comm)
#date_tag = container.find("div", class_="_1O8E5N17").text
#date_text,date_value = str.split(date_tag,':')
#datestostay.append(date_value)
comm1 = str(container.find("div", class_="nf9vGX55").find('span'))
rat = re.findall(r'd+', str(comm1))
rat1 = (str(rat))[2]
notes.append(rat1)
datereal = container.find("div", class_= "_2fxQ4TOx").text
date = datereal[-9:]
dates.append(date)
data = pd.DataFrame({
'comms' : comms,
#'datestostay' : datestostay,
'notes' : notes,
'dates' : dates
})
data['dates'] = pd.to_datetime(data['dates']).dt.date
data['dates'] = pd.to_datetime(data['dates'])
data['dates'] = data.dates.dt.strftime('%Y-%m')
'''
data['datestostay'] = pd.to_datetime(data['datestostay']).dt.date
data['datestostay'] = pd.to_datetime(data['datestostay'])
data['datestostay'] = data.datestostay.dt.strftime('%Y-%m')
'''
#print(data.head())
data.to_csv('table4.csv', sep=';', index=False)
Thanks for your help :)