wget https://archive.ics.uci.edu/ml/machine-learning-databases/00210/donation.zip
数据清洗
cd /Users/erichan/garden/spark-1.6.0-bin-hadoop2.6/bin
./spark-shell --master local
// Input path for the linkage data; it is read below with sc.textFile.
val data = "/Users/erichan/AliDrive/ml_spark/data/linkage"
val rawblocks = sc.textFile(data)
// REPL notes from an earlier session:
//   rawblocks.count()              => res0: Long = 6552407
//   val head = rawblocks.take(10)

// Keep only the lines that do not contain the "id_1" column name (drops header rows).
val noheader = rawblocks.filter(!_.contains("id_1"))
// Sanity check: print any remaining line that still mentions a header column name.
noheader.filter(_.contains("cmp_fname_c1")).foreach(line => println(line))
// REPL note:
//   noheader.count()               => res1: Long = 6552396
/**
 * One parsed input record.
 *
 * @param id1     first integer identifier of the record
 * @param id2     second integer identifier of the record
 * @param scores  numeric score columns of the record
 * @param matched boolean flag carried by the record (presumably "the two ids
 *                refer to the same entity" -- confirm against the data set docs)
 */
case class MatchData(
  id1: Int,
  id2: Int,
  scores: Array[Double],
  matched: Boolean
)
/**
 * Converts a text field to a Double.
 *
 * The placeholder "?" is mapped to Double.NaN; every other value is parsed
 * with `String.toDouble`, which throws NumberFormatException for text that is
 * not a valid number.
 */
def toDouble(s: String) = s match {
  case "?"   => Double.NaN
  case other => other.toDouble
}
/**
 * Splits one comma-separated line into a MatchData record.
 *
 * Column layout used here: field 0 and field 1 are the integer ids, fields
 * 2 through 10 are the scores (converted with `toDouble`), and field 11 is
 * the boolean match flag.
 */
def parse(line: String) = {
  val fields = line.split(',')
  MatchData(
    id1 = fields(0).toInt,
    id2 = fields(1).toInt,
    scores = fields.slice(2, 11).map(field => toDouble(field)),
    matched = fields(11).toBoolean
  )
}
// Parse every non-header line into a MatchData record.
// The RDD is cached because it feeds several separate actions (countByValue() just
// below and the per-column stats() jobs further down); without cache() each of those
// actions would re-read and re-parse the whole text input from scratch.
val parsed = noheader.map(line => parse(line)).cache()
// Number of records for each value of `matched`, returned to the driver as a Map.
val matchCounts = parsed.map(md => md.matched).countByValue()
// The same counts as a sequence of (value, count) pairs.
val matchCountsSeq = matchCounts.toSeq
import java.lang.Double.isNaN
// Summary statistics for each of the nine score columns, one entry per column index.
// NaN entries (missing values) are dropped before stats() is computed. Each column is
// processed by its own Spark job over `parsed`.
val stats = for (i <- 0 until 9) yield {
  parsed.map(_.scores(i)).filter(score => !isNaN(score)).stats()
}