import StringIO as stio

import numpy as np
from PIL import Image
from pyspark import SparkContext
from pyspark.mllib.feature import PCA as PCAmllib

# get_new_desc(image_array) is my own feature extractor, defined elsewhere.

if __name__ == "__main__":
    sc = SparkContext(appName="classification")

    # (path, raw JPEG bytes) pairs, decoded into numpy arrays with PIL
    images = sc.binaryFiles("/home/kdh/lessImage/*.jpg")
    imagerdd = images.map(lambda (x, y): (x, np.asarray(Image.open(stio.StringIO(y)))))

    # (path, descriptor) pairs: each image becomes a 3888-value feature vector
    imagerdd1 = imagerdd.map(lambda (x, y): (x, get_new_desc(y)))

    imageVal = imagerdd1.values().take(1)
    print imageVal
    print np.asarray(imageVal).shape

    rdd_val = imagerdd1.values()

    # reduce the 3888 dimensions to 1000 principal components
    model = PCAmllib(1000).fit(rdd_val)
    transformed = model.transform(rdd_val)

    # transformed is a plain RDD, so it has no value() method; inspect with take()
    pcaOneVal = transformed.take(1)
    print pcaOneVal

My project classifies images by their features, so I need PCA to reduce the dimensionality.
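
For reference, this is how I understand pyspark.mllib.feature.PCA is normally fed in Spark 1.6 (the traceback below shows PCAmllib is this class): fit() takes an RDD of MLlib vectors or plain Python lists of floats. A minimal self-contained sketch with made-up toy data:

from pyspark import SparkContext
from pyspark.mllib.feature import PCA
from pyspark.mllib.linalg import Vectors

sc = SparkContext(appName="pca-toy")

# fit() expects an RDD of MLlib vectors (or plain lists of floats)
data = sc.parallelize([
    Vectors.dense([1.0, 0.0, 0.0, 7.0]),
    Vectors.dense([2.0, 0.0, 3.0, 4.0]),
    Vectors.dense([4.0, 0.0, 0.0, 6.0]),
])

model = PCA(2).fit(data)           # keep the top 2 principal components
projected = model.transform(data)  # RDD of 2-dimensional vectors
print projected.collect()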

imagerdd1 = imagerdd.map(lambda (x, y): (x, get_new_desc(y)))

x : the image's file path
y : the feature descriptor returned by get_new_desc (3888 values per image)
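
Before calling fit I can sanity-check what one descriptor actually looks like (my own debugging sketch, reusing the names from the code above):

# What exactly does one descriptor look like before it reaches MLlib?
sample = imagerdd1.values().first()
arr = np.asarray(sample)
print type(sample)  # a numpy.ndarray in my case
print arr.shape     # should be a flat (3888,) vector, not 2-D
print arr.dtype     # a non-float or structured dtype may not survive pickling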

But an error pops up!

The failure occurs at PCAmllib(1000).fit(rdd_val):

16/08/25 01:47:13 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1)
net.razorvine.pickle.PickleException: expected zero arguments for construction of ClassDict (for numpy.dtype)
	at net.razorvine.pickle.objects.ClassDictConstructor.construct(ClassDictConstructor.java:23)
	at net.razorvine.pickle.Unpickler.load_reduce(Unpickler.java:707)
	at net.razorvine.pickle.Unpickler.dispatch(Unpickler.java:175)
	at net.razorvine.pickle.Unpickler.load(Unpickler.java:99)
	at net.razorvine.pickle.Unpickler.loads(Unpickler.java:112)
	at org.apache.spark.mllib.api.python.SerDe$$anonfun$pythonToJava$1$$anonfun$apply$2.apply(PythonMLLibAPI.scala:1507)
	at org.apache.spark.mllib.api.python.SerDe$$anonfun$pythonToJava$1$$anonfun$apply$2.apply(PythonMLLibAPI.scala:1506)
	at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:371)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:308)
	at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
	at org.apache.spark.rdd.RDD$$anonfun$take$1$$anonfun$28.apply(RDD.scala:1328)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1858)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
	at org.apache.spark.scheduler.Task.run(Task.scala:89)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)
16/08/25 01:47:14 WARN TaskSetManager: Lost task 0.0 in stage 1.0 (TID 1, localhost): net.razorvine.pickle.PickleException: expected zero arguments for construction of ClassDict (for numpy.dtype)
	(same stack trace as above)
16/08/25 01:47:14 ERROR TaskSetManager: Task 0 in stage 1.0 failed 1 times; aborting job
16/08/25 01:47:14 INFO TaskSchedulerImpl: Removing TaskSet 1.0, whose tasks have all completed, from pool
16/08/25 01:47:14 INFO TaskSchedulerImpl: Cancelling stage 1
16/08/25 01:47:14 INFO DAGScheduler: ResultStage 1 (first at PCA.scala:42) failed in 44.752 s
16/08/25 01:47:14 INFO DAGScheduler: Job 1 failed: first at PCA.scala:42, took 44.807864 s
Traceback (most recent call last):
  File "/home/kdh/workspace/classification0/src/test.py", line 482, in <module>
    model = PCAmllib(2).fit(rdd_val)
  File "/home/kdh/spark-1.6.1-bin-hadoop2.6/python/pyspark/mllib/feature.py", line 318, in fit
    jmodel = callMLlibFunc("fitPCA", self.k, data)
  File "/home/kdh/spark-1.6.1-bin-hadoop2.6/python/pyspark/mllib/common.py", line 130, in callMLlibFunc
    return callJavaFunc(sc, api, *args)
  File "/home/kdh/spark-1.6.1-bin-hadoop2.6/python/pyspark/mllib/common.py", line 123, in callJavaFunc
    return _java2py(sc, func(*args))
  File "/home/kdh/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py", line 813, in __call__
  File "/home/kdh/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py", line 308, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o28.fitPCA.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1.0 failed 1 times, most recent failure: Lost task 0.0 in stage 1.0 (TID 1, localhost): net.razorvine.pickle.PickleException: expected zero arguments for construction of ClassDict (for numpy.dtype)
	(same executor stack trace as above)
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858)
	at org.apache.spark.rdd.RDD.take(RDD.scala:1302)
	at org.apache.spark.rdd.RDD.first(RDD.scala:1341)
	at org.apache.spark.mllib.feature.PCA.fit(PCA.scala:42)
	at org.apache.spark.mllib.api.python.PythonMLLibAPI.fitPCA(PythonMLLibAPI.scala:634)
	at py4j.Gateway.invoke(Gateway.java:259)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:209)
	at java.lang.Thread.run(Thread.java:745)
Caused by: net.razorvine.pickle.PickleException: expected zero arguments for construction of ClassDict (for numpy.dtype)
	(same executor stack trace as above)
	... 1 more
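
From the trace, the job dies inside SerDe.pythonToJava while fitPCA reads the rows, so I suspect the raw numpy arrays coming out of get_new_desc cannot be unpickled on the JVM side (the Java unpickler knows how to rebuild DenseVector, but not numpy.dtype). A conversion along these lines is what I am considering (my own untested sketch, assuming each descriptor is a flat numpy array):

from pyspark.mllib.linalg import Vectors

# Hand MLlib DenseVectors of plain floats instead of raw numpy arrays
rdd_vec = rdd_val.map(lambda arr: Vectors.dense([float(v) for v in np.ravel(arr)]))

model = PCAmllib(1000).fit(rdd_vec)
transformed = model.transform(rdd_vec)

Is this the right way to fix it, or am I misunderstanding what fit expects?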