开发者社区> 问答> 正文

在给出异常的RDD上执行combineByKey转换。Spark转换

社区小助手 2018-12-12 14:13:51 382

我正在尝试使用以下代码生成客户统计信息。这是combineByKey转换。我得到了一个ArrayIndexOutOfBounds异常。想知道原因,但我没有得到任何暗示。

def createComb = (t:Array[String]) => {
val total = t(5).toDouble
val q = t(4).toInt
(total/q, total/q, q, total)}

def mergeValues : ((Double,Double,Int,Double), Array[String]) =>
(Double,Double,Int,Double) =
{case((mx,mn,q,tot),t) =>{
val total = t(5).toDouble
val quan = t(4).toInt
val mxx = scala.math.max(mx, total/q)
val minn = scala.math.min(mn, total/q)
(mxx,minn,quan+q,total+tot)}}

def mergeComb:((Double,Double,Int,Double),(Double,Double,Int,Double)) =>
(Double,Double,Int,Double) =
{ case((mx1,mn1,q1,tot1),(mx2,mn2,q2,tot2)) =>
(scala.math.max(mx1,mx2), scala.math.min(mn1,mn2), q1+q2, tot1+tot2)}

val statsOfCust = productsTotalByKey.combineByKey(createComb, mergeValues, mergeComb, new org.apache.spark.HashPartitioner(productsTotalByKey.partitions.size))
这是在执行上面代码的火花簇上执行RDD后得到的输出。

scala> statsOfCust.first
[Stage 22:> (0 + 1) / 2]18/11/17 21:26:31 WARN TaskSetManager: Lost task 0.0 in stage 22.0 (TID 26, wn01.itversity.com, executor 9): java.lang.ArrayIndexOutOfBoundsException: 5

at $line80.$read

$$ iw $$

iw

$$ iw $$

iw

$$ iw $$

iw

$$ iw $$

iw

$$ anonfun$createComb$1.apply(:24) at $line80.$read $$

iw

$$ iw $$

iw

$$ iw $$

iw

$$ iw $$

iw

$$ iw $$

anonfun$createComb$1.apply(:23)

at org.apache.spark.util.collection.ExternalSorter

$$ anonfun$5.apply(ExternalSorter.scala:189) at org.apache.spark.util.collection.ExternalSorter $$

anonfun$5.apply(ExternalSorter.scala:188)

at org.apache.spark.util.collection.AppendOnlyMap.changeValue(AppendOnlyMap.scala:144)
at org.apache.spark.util.collection.SizeTrackingAppendOnlyMap.changeValue(SizeTrackingAppendOnlyMap.scala:32)
at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:194)
at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler

$$ failJobAndIndependentStages(DAGScheduler.scala:1599) at org.apache.spark.scheduler.DAGScheduler $$

anonfun$abortStage$1.apply(DAGScheduler.scala:1587)
at org.apache.spark.scheduler.DAGScheduler

$$ anonfun$abortStage$1.apply(DAGScheduler.scala:1586) at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1586) at org.apache.spark.scheduler.DAGScheduler $$

anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
at org.apache.spark.scheduler.DAGScheduler

$$ anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831) at scala.Option.foreach(Option.scala:257) at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831) at at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) at org.apache.spark.rdd.RDD.withScope(RDD.scala:363) at org.apache.spark.rdd.RDD.take(RDD.scala:1331) at org.apache.spark.rdd.RDD $$

anonfun$first$1.apply(RDD.scala:1372)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.RDD.first(RDD.scala:1371)
... 49 elided
Caused by: java.lang.ArrayIndexOutOfBoundsException: 5
at $anonfun$createComb$1.apply(:24)
at $anonfun$createComb$1.apply(:23)
at org.apache.spark.util.collection.ExternalSorter

$$ anonfun$5.apply(ExternalSorter.scala:189) at org.apache.spark.util.collection.ExternalSorter $$

anonfun$5.apply(ExternalSorter.scala:188)
at org.apache.spark.util.collection.AppendOnlyMap.changeValue(AppendOnlyMap.scala:144)
at org.apache.spark.util.collection.SizeTrackingAppendOnlyMap.changeValue(SizeTrackingAppendOnlyMap.scala:32)
at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:194)
at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
java scala apache-spark org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1820)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1769)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1758)
at org.apache.spark.util.EventLoop

$$ anon$1.run(EventLoop.scala:48) at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642) at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034) at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055) at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074) at org.apache.spark.rdd.RDD $$

anonfun$take$1.apply(RDD.scala:1358)

分布式计算 Java Scala Apache Spark
分享到
取消 提交回答
全部回答(1)
  • 社区小助手
    2019-07-17 23:20:11

    看起来像createComb方法的问题是你假设t数组至少有6个元素。

    这只是一个快速的geuss。如果有帮助,请告诉我。如果没有,我会尝试进一步调查:)

    0 0
+ 订阅

大数据计算实践乐园,近距离学习前沿技术

推荐文章
相似问题