背景:C++的第一个程序是helloworld,但对于spark scala,输出helloworld并没有什么意义,体现不了spark的精髓,所以wordcount才是王道……
话不多说,直接上代码wordcount
(spark本地实现)
本地文件test.txt如下:
spark shell java scala spark shell
import org.apache.spark.{SparkContext, SparkConf}
object spark_test {

  /** Classic Spark WordCount over a local text file.
    *
    * Reads E:/test.txt, splits each line on single spaces, counts word
    * occurrences, prints the raw counts, then prints them again sorted by
    * descending frequency.
    *
    * @param args unused command-line arguments
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("WordCount").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      val lines = sc.textFile("E:/test.txt") // read the local file

      // BUGFIX: split(" ") never yields a single-space token, so the original
      // `word != " "` filter was a no-op. It CAN yield empty strings when
      // delimiters are consecutive, so drop empty tokens instead.
      val words = lines.flatMap(_.split(" ")).filter(_.nonEmpty)

      val pairs = words.map(word => (word, 1))
      val wordscount = pairs.reduceByKey(_ + _)
      wordscount.collect.foreach(println)

      // Sort by occurrence count (descending): swap to (count, word),
      // sortByKey(false), then swap back to (word, count).
      // Reuse `wordscount` instead of recomputing reduceByKey a second time.
      val wordcount = wordscount
        .map { case (word, count) => (count, word) }
        .sortByKey(ascending = false)
        .map { case (count, word) => (word, count) }
      wordcount.collect.foreach(println)
    } finally {
      // Always release the SparkContext, even if the job throws.
      sc.stop()
    }
  }
}
结果如下:
(scala,1)
(spark,2)
(java,1)
(shell,2)
//通过sortByKey方法对单词出现的次序进行排序
(spark,2)
(shell,2)
(scala,1)
(java,1)