I have a problem that has blocked me for 2 days, hope that I can find here a solution:
I create a dataframe which contains a list of words and their positive and negative polarities for sentiment analysis task.
word positive.polarity negative.polarity
1 interesting 1 0
2 boring 0 1
For each word I extract its context which is a set of 3 preceding words.
I have always a list of booster words :
-booster_words <- c("more","enough", "a lot", "as", "so")
-negative_words <- c("not", "rien", "ni", "aucun", "nul", "jamais", "pas", "non plus", "sans")
I would like to create a new column positive.ponderate.polarity which contains positive polarity value devided per 3 if there in a booster and negative word in the context, and multiplied per 3 if there is only booster word in the context (there is ni negative word in context).
When I run with this sentence :
"The course was so interesting, but the professor was not boring"
I get this data frame :
word positive.polarity negative.polarity positive.ponderate.polarity
1 interesting 1 0 0.3333333
2 boring 0 1 0.0000000
BUT I have to find as result this dataframe :
word positive.polarity negative.polarity positive.ponderate.polarity
1 interesting 1 0 3
2 boring 0 1 0.0000000
Here is the code :
calcPolarity <- function(sentiment_DF,sentences){
booster_words <- c("more","enough", "a lot", "as", "so")
negative_words <- c("not", "rien", "ni", "aucun", "nul", "jamais", "pas", "non plus", "sans")
reduce_words <- c("peu", "presque", "moins", "seulement")
# pre-allocate the polarity result vector with size = number of sentences
polarity <- rep.int(0,length(sentences))
# loop per sentence
for(i in 1:length(polarity)){
sentence <- sentences[i]
# separate each sentence in words using regular expression
wordsOfASentence <- unlist(regmatches(sentence,gregexpr("[[:word:]]+",sentence,perl=TRUE)))
# get the rows of sentiment_DF corresponding to the words in the sentence using match
# N.B. if a word occurs twice, there will be two equal rows
# (but I think it's correct since in this way you count its polarity twice)
subDF <- sentiment_DF[match(wordsOfASentence,sentiment_DF$word,nomatch = 0),]
# Find (number) of matching word.
wordOfInterest <- wordsOfASentence[which(wordsOfASentence %in% levels(sentiment_DF$word))] # No multigrepl, so working with duplicates instead. eg interesting
regexOfInterest <- paste0("([^\s]+\s){0,3}", wordOfInterest, "(\s[^\s]+){0,3}")
# extract a context of 3 words before the word in the dataframe
context <- stringr::str_extract(sentence, regexOfInterest)
names(context) <- wordOfInterest # Helps in forloop
print(context)
if(any(unlist(strsplit(context, " ")) %in% booster_words))
{
print(booster_words)
if(any(unlist(strsplit(context, " ")) %in% negative_words))
{
subDF$positive.ponderate.polarity <- subDF$positive.polarity / 3
}
else
{
subDF$positive.ponderate.polarity <- subDF$positive.polarity * 3
}
}
# Debug option
print(subDF)
# calculate the total polarity of the sentence and store in the vector
polarity[i] <- sum(subDF$positive.ponderate.polarity) - sum(subDF$negative.ponderate.polarity)
}
return(polarity)
}
sentiment_DF <- data.frame(word=c('interesting','boring','pretty'),
positive.polarity=c(1,0,1),
negative.polarity=c(0,1,0))
sentences <- c("The course was so interesting, but the professor was not boring")
result <- calcPolarity(sentiment_DF,sentences)
Usage :
result <- calcPolarity(sentiment_DF,sentences)
interesting boring
"course was so interesting" "professor was not boring"
[1] "more" "enough" "a lot" "as" "so"
word positive.polarity negative.polarity positive.ponderate.polarity
1 interesting 1 0 0.3333333
2 boring 0 1 0.0000000
EDIT:
calcPolarity <- function(sentiment_DF,sentences){
booster_words <- c("more","enough", "a lot", "as", "so")
negative_words <- c("not", "rien", "ni", "aucun", "nul", "jamais", "pas", "non plus", "sans")
reduce_words <- c("peu", "presque", "moins", "seulement")
# pre-allocate the polarity result vector with size = number of sentences
polarity <- rep.int(0,length(sentences))
# loop per sentence
for(i in 1:length(polarity)){
sentence <- sentences[i]
# separate each sentence in words using regular expression
wordsOfASentence <- unlist(regmatches(sentence,gregexpr("[[:word:]]+",sentence,perl=TRUE)))
# get the rows of sentiment_DF corresponding to the words in the sentence using match
# N.B. if a word occurs twice, there will be two equal rows
# (but I think it's correct since in this way you count its polarity twice)
subDF <- sentiment_DF[match(wordsOfASentence,sentiment_DF$word,nomatch = 0),]
# Find (number) of matching word.
wordOfInterest <- wordsOfASentence[which(wordsOfASentence %in% levels(sentiment_DF$word))] # No multigrepl, so working with duplicates instead. eg interesting
regexOfInterest <- paste0("([^\s]+\s){0,3}", wordOfInterest, "(\s[^\s]+){0,3}")
# extract a context of 3 words before the word in the dataframe
context <- stringr::str_extract(sentence, regexOfInterest)
names(context) <- wordOfInterest # Helps in forloop
print(context)
for(i in 1:length(context)){
if(any(unlist(strsplit(context[i], " ")) %in% booster_words))
{
print(booster_words)
if(any(unlist(strsplit(context[i], " ")) %in% negative_words))
{
subDF$positive.ponderate.polarity <- subDF$positive.polarity + 4
}
else
{
subDF$positive.ponderate.polarity <- subDF$positive.polarity + 9
}
}
}
# Debug option
print(subDF)
# calculate the total polarity of the sentence and store in the vector
polarity[i] <- sum(subDF$positive.ponderate.polarity) - sum(subDF$negative.ponderate.polarity)
}
return(polarity)
}
sentiment_DF <- data.frame(word=c('interesting','boring','pretty'),
positive.polarity=c(1,0,1),
negative.polarity=c(0,1,0))
sentences <- c("The course was interesting, but the professor was not so boring")
result <- calcPolarity(sentiment_DF,sentences)
I get this result :
word positive.polarity negative.polarity positive.ponderate.polarity
1 interesting 1 0 5
2 boring 0 1 4
But it is incorrest, I must have this result
word positive.polarity negative.polarity positive.ponderate.polarity
1 interesting 1 0 1
2 boring 0 1 4
Any idea please?
See Question&Answers more detail:
os