pom.xml configuration
<!-- Shared properties -->
<properties>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <encoding>UTF-8</encoding>
    <scala.version>2.11.8</scala.version>
    <spark.version>2.2.0</spark.version>
    <hadoop.version>2.7.1</hadoop.version>
    <scala.compat.version>2.11</scala.compat.version>
</properties>

<!-- Shared dependencies -->
<dependencies>
    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-library</artifactId>
        <version>${scala.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
</dependencies>

<!-- Build configuration -->
<build>
    <!-- Source folder -->
    <sourceDirectory>src/main/scala</sourceDirectory>
    <!-- Build plugins -->
    <plugins>
        <!-- Compiles the Scala code to class files -->
        <plugin>
            <groupId>net.alchim31.maven</groupId>
            <artifactId>scala-maven-plugin</artifactId>
            <version>3.2.2</version>
            <executions>
                <execution>
                    <goals>
                        <goal>compile</goal>
                        <goal>testCompile</goal>
                    </goals>
                    <configuration>
                        <args>
                            <arg>-dependencyfile</arg>
                            <arg>${project.build.directory}/.scala_dependencies</arg>
                        </args>
                    </configuration>
                </execution>
            </executions>
        </plugin>
        <!-- Packages the application -->
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>2.4.3</version>
            <executions>
                <execution>
                    <phase>package</phase>
                    <goals>
                        <goal>shade</goal>
                    </goals>
                    <configuration>
                        <!-- Exclude the following files from the jar: avoids packaging errors
                             caused by duplicated dependencies (signature conflicts) -->
                        <filters>
                            <filter>
                                <artifact>*:*</artifact>
                                <excludes>
                                    <exclude>META-INF/*.SF</exclude>
                                    <exclude>META-INF/*.DSA</exclude>
                                    <exclude>META-INF/*.RSA</exclude>
                                </excludes>
                            </filter>
                        </filters>
                        <transformers>
                            <!-- Main-class entry point for the executable jar -->
                            <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                <mainClass></mainClass>
                            </transformer>
                        </transformers>
                    </configuration>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
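With this configuration in place, running mvn package compiles the Scala sources with the scala-maven-plugin and, because the shade plugin is bound to the package phase, produces a single merged jar under target/; that is the jar submitted to the cluster below. Note that <mainClass> is left empty here: it can be filled in with the fully qualified main class (SparkCore_01.SparkWordCount in the example that follows), or left empty and supplied with --class at submit time.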
The first WordCount
package SparkCore_01

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * First Spark program
 */
object SparkWordCount {
  // Every Spark program needs a main method
  def main(args: Array[String]): Unit = {
    // 0. Build the configuration object that the SparkContext will use to load its environment
    /*
      Three core methods:
      set(key, value)  --> sets a configuration entry; key is the property name, value its value
      setAppName(name) --> sets the name the application runs under
      setMaster(mode)  --> sets the execution mode; required when running locally
        "local"        --> local mode, simulates Spark with a single thread
        "local[N]"     --> local mode, uses N threads to simulate Spark
                           (N should not exceed the number of CPU cores)
        "local[*]"     --> local mode, uses as many threads as the system has free
    */
    val conf = new SparkConf().setAppName("SparkWordCount").setMaster("local")
    // 1. Build the SparkContext, passing the conf object in
    val sc = new SparkContext(conf)

    // Processing the data with Spark
    // 1. Read the file contents; the argument is the file path (typically a txt or log file)
    val lines: RDD[String] = sc.textFile("dir/SparkCore_01/File.txt")
    // 2. Split each line into words
    val words: RDD[String] = lines.flatMap(_.split(" "))
    // 3. Pair each word with an initial count of 1
    val tuples: RDD[(String, Int)] = words.map((_, 1))
    // Spark provides an operator that aggregates values by key (one of the most widely
    // used operators): values that share a key are grouped together and reduced
    val sumed: RDD[(String, Int)] = tuples.reduceByKey(_ + _)

    //println(sumed.collect().toList)

    sc.stop() // close the SparkContext

    // Cluster-submission version (lines to change):
    // sc.textFile(args(0))          // read the input path from the program arguments
    // sumed.saveAsTextFile(args(1)) // save the result to the given path (on the cluster or
    //                               // locally); returns nothing, so drop the println
  }
}
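Putting the commented-out changes together, a minimal sketch of the cluster-submission version could look like the following. The object name SparkWordCountCluster is only illustrative; in the submit command below the original SparkWordCount class is reused with these lines changed in place.

package SparkCore_01

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

// Sketch of the cluster version: no setMaster (the master is passed to spark-submit),
// and the input/output paths come from the program arguments.
object SparkWordCountCluster {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SparkWordCount")
    val sc = new SparkContext(conf)

    val lines: RDD[String] = sc.textFile(args(0)) // input path, e.g. a file on HDFS
    val sumed: RDD[(String, Int)] = lines
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)
    sumed.saveAsTextFile(args(1)) // output directory; must not already exist

    sc.stop()
  }
}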
Packaging the program and submitting it to the cluster

Upload the jar to the appropriate node, then run the following from the bin directory of the Spark installation:
./spark-submit \
--class SparkCore_01.SparkWordCount \
--master spark://hadoop01:7077 \
--executor-memory 512m \
--total-executor-cores 2 \
/root/Spark_1905-1.0-SNAPSHOT.jar hdfs://hadoop01:8020/word.txt hdfs://hadoop01:8020/out2
PS: the last three values are, in order, the path to the jar, the HDFS file to read (args(0)), and the HDFS output directory to write to (args(1)); saveAsTextFile writes the result as part-* files under that directory, which must not exist beforehand.
Suppressing the log output
log4j.properties
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Set everything to be logged to the console
# Change the level here to control how much is shown
log4j.rootCategory=ERROR, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
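Placing this log4j.properties under src/main/resources puts it on the classpath, so it overrides Spark's default logging configuration. As an alternative that avoids editing the properties file, the level can also be lowered from inside the program with SparkContext.setLogLevel. A minimal sketch follows; the object name QuietWordCount is only illustrative, and messages emitted before the context is created will still appear.

import org.apache.spark.{SparkConf, SparkContext}

object QuietWordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("QuietWordCount").setMaster("local")
    val sc = new SparkContext(conf)
    // Lower the log level programmatically instead of (or in addition to) log4j.properties.
    // Valid levels include ALL, DEBUG, INFO, WARN, ERROR, FATAL, OFF, TRACE.
    sc.setLogLevel("ERROR")

    // ... the rest of the job goes here ...

    sc.stop()
  }
}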

