I am trying to use part of speech tagging from the openNLP/NLP packages in parallel. I need the code to work on any OS so am opting to use the parLapply
function from parallel (but am open to other OS independent options). In the past I ran tagPOS
function from the openNLP package in parLapply
with no problem. However, the openNLP package had some recent changes that eliminated tagPOS
and added some more flexible options. Kurt was kind enough to help me recreate the tagPOS
function from the new package's tools. I can get the lapply
version to work but not the parallel version. It keeps saying the nodes need more variables passed to them until it finally asks for a non-exported function from openNLP. This seems odd it would keep asking for more and more variables to be passed which tells me I'm setting up the parLapply
incorrectly. How can I set up the tagPOS
to operate in an parallel, OS independent fashion?
library(openNLP)
library(NLP)
library(parallel)
## POS tagger
tagPOS <- function(x, pos_tag_annotator, ...) {
s <- as.String(x)
## Need sentence and word token annotations.
word_token_annotator <- Maxent_Word_Token_Annotator()
a2 <- Annotation(1L, "sentence", 1L, nchar(s))
a2 <- annotate(s, word_token_annotator, a2)
a3 <- annotate(s, pos_tag_annotator, a2)
## Determine the distribution of POS tags for word tokens.
a3w <- a3[a3$type == "word"]
POStags <- unlist(lapply(a3w$features, `[[`, "POS"))
## Extract token/POS pairs (all of them): easy.
POStagged <- paste(sprintf("%s/%s", s[a3w], POStags), collapse = " ")
list(POStagged = POStagged, POStags = POStags)
} ## End of tagPOS function
## Set up a parallel run
text.var <- c("I like it.", "This is outstanding soup!",
"I really must get the recipe.")
ntv <- length(text.var)
PTA <- Maxent_POS_Tag_Annotator()
cl <- makeCluster(mc <- getOption("cl.cores", detectCores()/2))
clusterExport(cl=cl, varlist=c("text.var", "ntv",
"tagPOS", "PTA", "as.String", "Maxent_Word_Token_Annotator"),
envir = environment())
m <- parLapply(cl, seq_len(ntv), function(i) {
x <- tagPOS(text.var[i], PTA)
return(x)
}
)
stopCluster(cl)
## Error in checkForRemoteErrors(val) :
## 3 nodes produced errors; first error: could not find function
## "Maxent_Simple_Word_Tokenizer"
openNLP::Maxent_Simple_Word_Tokenizer
## >openNLP::Maxent_Simple_Word_Tokenizer
## Error: 'Maxent_Simple_Word_Tokenizer' is not an exported
## object from 'namespace:openNLP'
## It's a non exported function
openNLP:::Maxent_Simple_Word_Tokenizer
## Demo that it works with lapply
lapply(seq_len(ntv), function(i) {
tagPOS(text.var[i], PTA)
})
lapply(text.var, function(x) {
tagPOS(x, PTA)
})
## > lapply(seq_len(ntv), function(i) {
## + tagPOS(text.var[i], PTA)
## + })
## [[1]]
## [[1]]$POStagged
## [1] "I/PRP like/IN it/PRP ./."
##
## [[1]]$POStags
## [1] "PRP" "IN" "PRP" "."
##
## [[1]]$word.count
## [1] 3
##
##
## [[2]]
## [[2]]$POStagged
## [1] "THis/DT is/VBZ outstanding/JJ soup/NN !/."
##
## [[2]]$POStags
## [1] "DT" "VBZ" "JJ" "NN" "."
##
## [[2]]$word.count
## [1] 4
##
##
## [[3]]
## [[3]]$POStagged
## [1] "I/PRP really/RB must/MD get/VB the/DT recip/NN ./."
##
## [[3]]$POStags
## [1] "PRP" "RB" "MD" "VB" "DT" "NN" "."
##
## [[3]]$word.count
## [1] 6
EDIT: per Steve's suggestion
Note the openNLP is brand new. I installed ver 2.1 from a tar.gz from CRAN. I get the following error even though this function exists.
library(openNLP); library(NLP); library(parallel)
tagPOS <- function(text.var, pos_tag_annotator, ...) {
s <- as.String(text.var)
## Set up the POS annotator if missing (for parallel)
if (missing(pos_tag_annotator)) {
PTA <- Maxent_POS_Tag_Annotator()
}
## Need sentence and word token annotations.
word_token_annotator <- Maxent_Word_Token_Annotator()
a2 <- Annotation(1L, "sentence", 1L, nchar(s))
a2 <- annotate(s, word_token_annotator, a2)
a3 <- annotate(s, PTA, a2)
## Determine the distribution of POS tags for word tokens.
a3w <- a3[a3$type == "word"]
POStags <- unlist(lapply(a3w$features, "[[", "POS"))
## Extract token/POS pairs (all of them): easy.
POStagged <- paste(sprintf("%s/%s", s[a3w], POStags), collapse = " ")
list(POStagged = POStagged, POStags = POStags)
}
text.var <- c("I like it.", "This is outstanding soup!",
"I really must get the recipe.")
cl <- makeCluster(mc <- getOption("cl.cores", detectCores()/2))
clusterEvalQ(cl, {library(openNLP); library(NLP)})
m <- parLapply(cl, text.var, tagPOS)
## > m <- parLapply(cl, text.var, tagPOS)
## Error in checkForRemoteErrors(val) :
## 3 nodes produced errors; first error: could not find function "Maxent_POS_Tag_Annotator"
stopCluster(cl)
> packageDescription('openNLP')
Package: openNLP
Encoding: UTF-8
Version: 0.2-1
Title: Apache OpenNLP Tools Interface
Authors@R: person("Kurt", "Hornik", role = c("aut", "cre"), email =
"[email protected]")
Description: An interface to the Apache OpenNLP tools (version 1.5.3). The Apache OpenNLP
library is a machine learning based toolkit for the processing of natural language
text written in Java. It supports the most common NLP tasks, such as tokenization,
sentence segmentation, part-of-speech tagging, named entity extraction, chunking,
parsing, and coreference resolution. See http://opennlp.apache.org/ for more
information.
Imports: NLP (>= 0.1-0), openNLPdata (>= 1.5.3-1), rJava (>= 0.6-3)
SystemRequirements: Java (>= 5.0)
License: GPL-3
Packaged: 2013-08-20 13:23:54 UTC; hornik
Author: Kurt Hornik [aut, cre]
Maintainer: Kurt Hornik <[email protected]>
NeedsCompilation: no
Repository: CRAN
Date/Publication: 2013-08-20 15:41:22
Built: R 3.0.1; ; 2013-08-20 13:48:47 UTC; windows
See Question&Answers more detail:
os