I have a DataFrame in PySpark on which I am doing one-hot encoding.
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

# Categorical feature columns: each one is string-indexed and then one-hot
# encoded before being assembled into the feature vector.
categoricalColumns = [
    "incomedetails",
    "b2c",
    "gender",
    "occupation",
    "poa_status",
    "ac_type",
]
# Target column; it is indexed into "label" and kept out of the features.
labelCol = "BSConfirmBuy"

# VectorAssembler names each output attribute after its input column and
# rejects an empty name with "Cannot have an empty string for name.".
# Detect such columns up front so the failure is readable and points at the
# offending position instead of surfacing as a JVM stack trace in transform().
new_col_array = list(new_df1.columns)
emptyNamePositions = [i for i, name in enumerate(new_col_array) if name == ""]
if emptyNamePositions:
    raise ValueError(
        "new_df1 has column(s) with an empty name at position(s) "
        f"{emptyNamePositions}; rename or drop them (e.g. with "
        "new_df1.toDF(*new_names)) before building the pipeline."
    )

# One StringIndexer + OneHotEncoderEstimator pair per categorical column.
stages = []
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(
        inputCol=categoricalCol, outputCol=categoricalCol + "Index"
    )
    encoder = OneHotEncoderEstimator(
        inputCols=[stringIndexer.getOutputCol()],
        outputCols=[categoricalCol + "classVec"],
    )
    stages += [stringIndexer, encoder]

# Index the target column into the numeric "label" column.
label_stringIdx = StringIndexer(inputCol=labelCol, outputCol="label")
stages += [label_stringIdx]

# Every remaining column is passed to the assembler as-is. Build a new list
# rather than mutating an alias of the column list.
# NOTE(review): this assumes all remaining columns are numeric/boolean/vector;
# an identifier column such as client_id probably should be excluded too.
excludedCols = set(categoricalColumns) | {labelCol}
numericCols = [c for c in new_col_array if c not in excludedCols]

assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

# Fit all stages on the data, then append the derived columns to it.
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(new_df1)
new_df1 = pipelineModel.transform(new_df1)
I am getting this error:
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 63, in deco
return f(*a, **kw)
File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling o4626.transform.
: java.lang.IllegalArgumentException: requirement failed: Cannot have an empty string for name.
at scala.Predef$.require(Predef.scala:224)
at org.apache.spark.ml.attribute.Attribute$$anonfun$5.apply(attributes.scala:33)
at org.apache.spark.ml.attribute.Attribute$$anonfun$5.apply(attributes.scala:32)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.ml.attribute.Attribute.<init>(attributes.scala:32)
at org.apache.spark.ml.attribute.NumericAttribute.<init>(attributes.scala:201)
at org.apache.spark.ml.attribute.NumericAttribute.copy(attributes.scala:272)
at org.apache.spark.ml.attribute.NumericAttribute.withName(attributes.scala:212)
at org.apache.spark.ml.feature.VectorAssembler$$anonfun$2.apply(VectorAssembler.scala:111)
at org.apache.spark.ml.feature.VectorAssembler$$anonfun$2.apply(VectorAssembler.scala:98)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
at org.apache.spark.ml.feature.VectorAssembler.transform(VectorAssembler.scala:98)
I replaced the null values in the categorical columns before running the pipeline, using:
# Replace nulls in the categorical columns with the placeholder "others".
new_df1 = new_df1.fillna(
    value="others",
    subset=[
        "occupation",
        "b2c",
        "ac_type",
        "gender",
        "incomedetails",
        "poa_status",
    ],
)
I have also run this entire code without the categorical columns at all, and I still get the same error.
question from:
https://stackoverflow.com/questions/65935774/how-to-solve-cannot-have-an-empty-string-for-name-in-pyspark 与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…