Finally, to carry out the classification, the concatenation of the TF sentence representation and the word-based features is used as input to the different algorithms (SVM, LR, MLP, MultinomialNB).
df=pd.read_csv("C:/Users/User/Desktop/Dataset with stopword.csv")
df.shape
def noramlize(Tweet):
Tweet = re.sub(r"[?????]", "?", Tweet)
Tweet = re.sub(r"?", "?", Tweet)
Tweet = re.sub(r"?", "?", Tweet)
Tweet = re.sub(r"?", "?", Tweet)
Tweet = re.sub(r'[^?-? ]', "", Tweet)
noise = re.compile(""" ? | # Tashdid
? | # Fatha
? | # Tanwin Fath
? | # Damma
? | # Tanwin Damm
? | # Kasra
? | # Tanwin Kasr
? | # Sukun
? # Tatwil/Kashida
""", re.VERBOSE)
Tweet = re.sub(noise, '', Tweet)
return Tweet
def stopWordRmove(Tweet):
ar_stop_list = open("ar_stop_word_list.txt", "r", encoding="utf8")
stop_words = ar_stop_list.read().split('
')
needed_words = []
words = word_tokenize(Tweet)
for w in words:
if w not in (stop_words):
needed_words.append(w)
filtered_sentence = " ".join(needed_words)
return filtered_sentence
def stemming(Tweet):
st = ISRIStemmer()
stemmed_words = []
words = word_tokenize(Tweet)
for w in words:
stemmed_words.append(st.stem(w))
stemmed_sentence = " ".join(stemmed_words)
return stemmed_sentence
def prepareDataSets(df):
sentences = []
for index, r in df.iterrows():
Tweet = noramlize(r['Tweet'])
Tweet = stopWordRmove(r['Tweet'])
Tweet = stemming(r['Tweet'])
if r['Affect Dimension'] == 'fear':
sentences.append([Tweet, 'fear'])
if r['Affect Dimension'] == 'anger':
sentences.append([Tweet, 'anger'])
if r['Affect Dimension'] == 'joy':
sentences.append([Tweet, 'joy'])
if r['Affect Dimension'] == 'sadness':
sentences.append([Tweet, 'sadness'])
df_sentences = DataFrame(sentences, columns=['Tweet', 'Affect Dimension'])
return df_sentences
preprocessed_df = prepareDataSets(df)
preprocessed_df
def featureExtraction(data):
vectorizer = CountVectorizer()
Count_data = vectorizer.fit_transform(data)
return Count_data
def learning(clf, X, Y):
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.33, random_state=0)
classifer = clf()
classifer.fit(X_train, Y_train)
predict = cross_val_predict(classifer, X_test, Y_test, cv=10, fit_params=None)
scores = cross_val_score(classifer,X_test, Y_test, cv=10)
print (scores)
print ("Accuracy of %s: %0.2f (+/- %0.2f)" % (classifer, scores.mean(), scores.std() *2))
print (classification_report(Y_test, predict))
main(SVC)
clfs = [LogisticRegression, MultinomialNB, MLPClassifier]
for clf in clfs:
main(clf)