-
Standalone:在 K8S 啟動一個長期運行的集群,所有 Job 都通過 spark-submit 向這個集群提交
-
Kubernetes Native:通過 spark-submit 直接向 K8S 的 API Server 提交,申請到資源后啟動 Pod 做為 Driver 和 Executor 執行 Job,參考 http://spark.apache.org/docs/2.4.6/running-on-kubernetes.html
-
Spark Operator:安裝 Spark Operator,然后定義 spark-app.yaml,再執行 kubectl apply -f spark-app.yaml,這種申明式 API 和調用方式是 K8S 的典型應用方式,參考 https://github.com/GoogleCloudPlatform/spark-on-k8s-operator
部署案例----Kubernetes Native
下載spark安裝包
下載地址:https://archive.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
下載的是帶有hadoop依賴包的軟件。
制作spark鏡像
上傳到服務器,並在服務器上解壓下載的軟件包
tar zxvf spark-2.4.5-bin-hadoop2.7.tgz
#軟連接
ln -s <sparkhome,剛剛解壓的文件目錄路徑> /opt/spark
制作鏡像
鏡像倉庫的名稱可以隨便寫,在使用spark-submit提交job時,指定的是本地的鏡像,並非從遠程倉庫獲取的鏡像
cd /opt/spark
./bin/docker-image-tool.sh -r registry.cn-beijing.aliyuncs.com -t spark-v2.4.5 build
查看本地生成的鏡像
[root@iZ2ze48olpbvnopfiqqk33Z spark]# docker images

提交任務
查看K8S集群信息
[root@iZ2ze48olpbvnopfiqqk33Z spark]# kubectl cluster-info

提交任務
./bin/spark-submit \
  --master [K8S集群地址] \
  --deploy-mode cluster \
  --name spark-pi \
  --class org.apache.spark.examples.JavaSparkPi \
  --conf spark.executor.instances=5 \
  --conf spark.kubernetes.container.image=registry.cn-beijing.aliyuncs.com/spark:spark-v2.4.5 \
  local:///opt/spark/examples/jars/spark-examples_2.11-2.4.5.jar
[K8S集群地址] 替換為自己集群的地址 例:k8s://https://192.168.1.136:6443
查看狀態
kubectl get all

查看詳細報錯日志
[root@iZ2ze48olpbvnopfiqqk33Z ~]# kubectl logs pod/spark-pi-1635131060330-driver
21/10/25 03:04:24 INFO SparkContext: Successfully stopped SparkContext
Exception in thread "main" org.apache.spark.SparkException: External scheduler cannot be instantiated
at org.apache.spark.SparkContext$.org$apache$spark$SparkContext$$createTaskScheduler(SparkContext.scala:2794)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:493)
at org.apache.spark.SparkContext$.getOrCreate(SparkContext.scala:2520)
at org.apache.spark.sql.SparkSession$Builder$$anonfun$7.apply(SparkSession.scala:935)
at org.apache.spark.sql.SparkSession$Builder$$anonfun$7.apply(SparkSession.scala:926)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:926)
at org.apache.spark.examples.JavaSparkPi.main(JavaSparkPi.java:37)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:845)
at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:161)
at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:184)
at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:86)
at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:920)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:929)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: io.fabric8.kubernetes.client.KubernetesClientException: Failure executing: GET at: https://kubernetes.default.svc/api/v1/namespaces/default/pods/spark-pi-1635131060330-driver. Message: Forbidden!Configured service account doesn't have access. Service account may have been revoked. pods "spark-pi-1635131060330-driver" is forbidden: User "system:serviceaccount:default:default" cannot get resource "pods" in API group "" in the namespace "default".
at io.fabric8.kubernetes.client.dsl.base.OperationSupport.requestFailure(OperationSupport.java:510)
at io.fabric8.kubernetes.client.dsl.base.OperationSupport.assertResponseCode(OperationSupport.java:447)
at io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleResponse(OperationSupport.java:413)
at io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleResponse(OperationSupport.java:372)
at io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleGet(OperationSupport.java:337)
at io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleGet(OperationSupport.java:318)
at io.fabric8.kubernetes.client.dsl.base.BaseOperation.handleGet(BaseOperation.java:833)
at io.fabric8.kubernetes.client.dsl.base.BaseOperation.getMandatory(BaseOperation.java:226)
at io.fabric8.kubernetes.client.dsl.base.BaseOperation.get(BaseOperation.java:170)
at org.apache.spark.scheduler.cluster.k8s.ExecutorPodsAllocator$$anonfun$1.apply(ExecutorPodsAllocator.scala:57)
at org.apache.spark.scheduler.cluster.k8s.ExecutorPodsAllocator$$anonfun$1.apply(ExecutorPodsAllocator.scala:55)
at scala.Option.map(Option.scala:146)
at org.apache.spark.scheduler.cluster.k8s.ExecutorPodsAllocator.<init>(ExecutorPodsAllocator.scala:55)
at org.apache.spark.scheduler.cluster.k8s.KubernetesClusterManager.createSchedulerBackend(KubernetesClusterManager.scala:89)
at org.apache.spark.SparkContext$.org$apache$spark$SparkContext$$createTaskScheduler(SparkContext.scala:2788)
... 19 more
21/10/25 03:09:24 INFO ShutdownHookManager: Shutdown hook called
21/10/25 03:09:24 INFO ShutdownHookManager: Deleting directory /tmp/spark-80b52b4e-a2ca-467d-a60b-a14b8d8ccdba
21/10/25 03:09:24 INFO ShutdownHookManager: Deleting directory /var/data/spark-a8985894-a6f4-488c-9341-a27cffd859ee/spark-9b9abc84-8744-4228-a491-a5f13c747ab7
報錯解決:
1. kubectl create serviceaccount spark
2. kubectl create clusterrolebinding spark-role --clusterrole=edit --serviceaccount=default:spark --namespace=default
3. spark-submit --conf spark.kubernetes.authenticate.driver.serviceAccountName=spark
刪除報錯的任務,重新執行
./bin/spark-submit \
  --master [K8S集群地址] \
  --deploy-mode cluster \
  --name spark-pi \
  --class org.apache.spark.examples.JavaSparkPi \
  --conf spark.executor.instances=5 \
  --conf spark.kubernetes.authenticate.driver.serviceAccountName=spark \
  --conf spark.kubernetes.container.image=registry.cn-beijing.aliyuncs.com/spark:spark-v2.4.5 \
  local:///opt/spark/examples/jars/spark-examples_2.11-2.4.5.jar
Exec報錯

報錯--運行狀態ImagePullBackOff
查看詳細信息
[root@iZ2ze48olpbvnopfiqqk33Z ~]# kubectl describe pod/spark-pi-1635139347481-driver
Name: spark-pi-1635139347481-driver
Namespace: default
Priority: 0
Node: cn-beijing.xx.xx.x.2/xx.xx.x.2
Start Time: Mon, 25 Oct 2021 13:22:28 +0800
Labels: spark-app-selector=spark-622bf6defc67445da34ebb43947473ac
spark-role=driver
Annotations: kubernetes.io/psp: ack.privileged
Status: Running
IP: 172.22.113.222
IPs:
IP: 172.22.113.222
Containers:
spark-kubernetes-driver:
Container ID: docker://8a967fd9711cf2f784cdafda4db4109afd56daf50b35ba482cd7e5d0bbc06d1e
Image: registry.cn-beijing.aliyuncs.com/spark:spark-v2.4.5
Image ID: docker://sha256:75733d0f823832c555bdf4c6412587fe340838db9eeea925b74178f3216b3f2c
Ports: 7078/TCP, 7079/TCP, 4040/TCP
Host Ports: 0/TCP, 0/TCP, 0/TCP
Args:
driver
--properties-file
/opt/spark/conf/spark.properties
--class
org.apache.spark.examples.JavaSparkPi
spark-internal
State: Running
Started: Mon, 25 Oct 2021 13:22:29 +0800
Ready: True
Restart Count: 0
Limits:
memory: 1408Mi
Requests:
cpu: 1
memory: 1408Mi
Environment:
SPARK_DRIVER_BIND_ADDRESS: (v1:status.podIP)
SPARK_LOCAL_DIRS: /var/data/spark-0ba71901-ccc9-4f59-ac8f-b5fbc06509d2
SPARK_CONF_DIR: /opt/spark/conf
Mounts:
/opt/spark/conf from spark-conf-volume (rw)
/var/data/spark-0ba71901-ccc9-4f59-ac8f-b5fbc06509d2 from spark-local-dir-1 (rw)
/var/run/secrets/kubernetes.io/serviceaccount from spark-token-rw6sc (ro)
Conditions:
Type Status
Initialized True
Ready True
ContainersReady True
PodScheduled True
Volumes:
spark-local-dir-1:
Type: EmptyDir (a temporary directory that shares a pod's lifetime)
Medium:
SizeLimit: <unset>
spark-conf-volume:
Type: ConfigMap (a volume populated by a ConfigMap)
Name: spark-pi-1635139347481-driver-conf-map
Optional: false
spark-token-rw6sc:
Type: Secret (a volume populated by a Secret)
SecretName: spark-token-rw6sc
Optional: false
QoS Class: Burstable
Node-Selectors: <none>
Tolerations: node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
Events: <none>
報錯無法拉取鏡像,分析原因鏡像並沒有上傳到鏡像倉庫,執行Executor的node機器並不能獲取到鏡像,因此報錯
解決:
將spark制作生成的鏡像手工傳到各個node機器上,執行spark-submit提交Job時,指定配置參數只從本地獲取鏡像
-
將生成的鏡像導出
docker save -o spark.tar registry.cn-beijing.aliyuncs.com/regis-k/vicky:spark-push-v2.4.5
（注：後續在節點上 docker load 時輸出的鏡像名稱為 registry-vpc.cn-beijing.aliyuncs.com/acs/spark:spark-v2.4.5，導出前請確認本地鏡像標籤與之一致）
-
將鏡像分發到各個節點
使用 scp 分發鏡像：scp spark.tar shadow@xx.xx.xx.x:/home/shadow （將 xx.xx.xx.x 替換為各節點的 IP 地址）
-
在各個節點將鏡像導入到節點鏡像庫中
[root@iZ2ze48olpbvnopfiqqk2zZ shadow]# docker load < spark.tar
e8b689711f21: Loading layer [==================================================>] 83.86MB/83.86MB
2bf2b8c78141: Loading layer [==================================================>] 5.177MB/5.177MB
f3fd6088fa34: Loading layer [==================================================>] 3.584kB/3.584kB
8138b5fec066: Loading layer [==================================================>] 210.9MB/210.9MB
4f731e722019: Loading layer [==================================================>] 25.31MB/25.31MB
3a022d792160: Loading layer [==================================================>] 241MB/241MB
4ae4c587a876: Loading layer [==================================================>] 73.73kB/73.73kB
04094b33ae8b: Loading layer [==================================================>] 58.88kB/58.88kB
33fe39ef5ffe: Loading layer [==================================================>] 6.144kB/6.144kB
2805f97e9297: Loading layer [==================================================>] 3.942MB/3.942MB
1d877b4ff939: Loading layer [==================================================>] 9.728kB/9.728kB
12ff6a9017ed: Loading layer [==================================================>] 1.016MB/1.016MB
Loaded image: registry-vpc.cn-beijing.aliyuncs.com/acs/spark:spark-v2.4.5
-
重新提交job
./bin/spark-submit \
--master [K8S集群地址] \
--deploy-mode cluster \
--name spark-pi \
--class org.apache.spark.examples.JavaSparkPi \
--conf spark.executor.instances=5 \
--conf spark.kubernetes.authenticate.driver.serviceAccountName=spark \
--conf spark.kubernetes.container.image.pullPolicy=Never \
--conf spark.kubernetes.container.image=registry-vpc.cn-beijing.aliyuncs.com/acs/spark:spark-v2.4.5 \
local:///opt/spark/examples/jars/spark-examples_2.11-2.4.5.jar
spark.kubernetes.container.image 參數的值必須是完整的鏡像名稱（倉庫地址/倉庫名:標籤），即各節點上 docker load 後顯示的 registry-vpc.cn-beijing.aliyuncs.com/acs/spark:spark-v2.4.5，需與節點本地鏡像庫中的名稱完全一致
spark.kubernetes.container.image.pullPolicy 參數設為 Never，代表永不從遠程倉庫拉取鏡像，只使用節點本地已有的鏡像
-
查看job,已經可以正常創建並正常執行程序

-
執行過程日志
[root@iZ2ze48olpbvnopfiqqk33Z spark]# ./bin/spark-submit --master
[K8S集群地址]
--deploy-mode cluster --name spark-pi --class org.apache.spark.examples.JavaSparkPi --conf spark.executor.instances=5 --conf spark.kubernetes.authenticate.driver.serviceAccountName=spark --conf spark.kubernetes.container.image.pullPolicy=Never --conf spark.kubernetes.container.image=registry-vpc.cn-beijing.aliyuncs.com/acs/spark:spark-v2.4.5 local:////opt/spark/examples/jars/spark-examples_2.11-2.4.5.jar
log4j:WARN No appenders could be found for logger (io.fabric8.kubernetes.client.Config).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
21/10/27 14:06:37 INFO LoggingPodStatusWatcherImpl: State changed, new state:
pod name: spark-pi-1635314796749-driver
namespace: default
labels: spark-app-selector -> spark-946d374c98b6426cb38bdc7d44750175, spark-role -> driver
pod uid: bf662da1-335a-4680-b08f-fd52392f0649
creation time: 2021-10-27T06:06:37Z
service account name: spark
volumes: spark-local-dir-1, spark-conf-volume, spark-token-rw6sc
node name: N/A
start time: N/A
container images: N/A
phase: Pending
status: []
21/10/27 14:06:37 INFO LoggingPodStatusWatcherImpl: State changed, new state:
pod name: spark-pi-1635314796749-driver
namespace: default
labels: spark-app-selector -> spark-946d374c98b6426cb38bdc7d44750175, spark-role -> driver
pod uid: bf662da1-335a-4680-b08f-fd52392f0649
creation time: 2021-10-27T06:06:37Z
service account name: spark
volumes: spark-local-dir-1, spark-conf-volume, spark-token-rw6sc
node name: xxxxxxx
start time: N/A
container images: N/A
phase: Pending
status: []
21/10/27 14:06:37 INFO LoggingPodStatusWatcherImpl: State changed, new state:
pod name: spark-pi-1635314796749-driver
namespace: default
labels: spark-app-selector -> spark-946d374c98b6426cb38bdc7d44750175, spark-role -> driver
pod uid: bf662da1-335a-4680-b08f-fd52392f0649
creation time: 2021-10-27T06:06:37Z
service account name: spark
volumes: spark-local-dir-1, spark-conf-volume, spark-token-rw6sc
node name: xxxxxxxx
start time: 2021-10-27T06:06:37Z
container images: registry-vpc.cn-beijing.aliyuncs.com/acs/spark:spark-v2.4.5
phase: Pending
status: [ContainerStatus(containerID=null, image=registry-vpc.cn-beijing.aliyuncs.com/acs/spark:spark-v2.4.5, imageID=, lastState=ContainerState(running=null, terminated=null, waiting=null, additionalProperties={}), name=spark-kubernetes-driver, ready=false, restartCount=0, state=ContainerState(running=null, terminated=null, waiting=ContainerStateWaiting(message=null, reason=ContainerCreating, additionalProperties={}), additionalProperties={}), additionalProperties={started=false})]
21/10/27 14:06:37 INFO Client: Waiting for application spark-pi to finish...
21/10/27 14:06:38 INFO LoggingPodStatusWatcherImpl: State changed, new state:
pod name: spark-pi-1635314796749-driver
namespace: default
labels: spark-app-selector -> spark-946d374c98b6426cb38bdc7d44750175, spark-role -> driver
pod uid: bf662da1-335a-4680-b08f-fd52392f0649
creation time: 2021-10-27T06:06:37Z
service account name: spark
volumes: spark-local-dir-1, spark-conf-volume, spark-token-rw6sc
node name: xxxxxxxx
start time: 2021-10-27T06:06:37Z
container images: registry-vpc.cn-beijing.aliyuncs.com/acs/spark:spark-v2.4.5
phase: Running
status: [ContainerStatus(containerID=docker://db5681643d2a34b56bb5706a965bc732877b77e70162d479deff269340d19f3f, image=registry-vpc.cn-beijing.aliyuncs.com/acs/spark:spark-v2.4.5, imageID=docker-pullable://registry.cn-beijing.aliyuncs.com/regis-k/vicky@sha256:4105a09b45d9648e1a757538c0df2d482e8d58fae752d961a88486ecbbf9f24e, lastState=ContainerState(running=null, terminated=null, waiting=null, additionalProperties={}), name=spark-kubernetes-driver, ready=true, restartCount=0, state=ContainerState(running=ContainerStateRunning(startedAt=2021-10-27T06:06:38Z, additionalProperties={}), terminated=null, waiting=null, additionalProperties={}), additionalProperties={started=true})]
21/10/27 14:06:44 INFO LoggingPodStatusWatcherImpl: State changed, new state:
pod name: spark-pi-1635314796749-driver
namespace: default
labels: spark-app-selector -> spark-946d374c98b6426cb38bdc7d44750175, spark-role -> driver
pod uid: bf662da1-335a-4680-b08f-fd52392f0649
creation time: 2021-10-27T06:06:37Z
service account name: spark
volumes: spark-local-dir-1, spark-conf-volume, spark-token-rw6sc
node name: xxxxxxxxx
start time: 2021-10-27T06:06:37Z
container images: registry-vpc.cn-beijing.aliyuncs.com/acs/spark:spark-v2.4.5
phase: Succeeded
status: [ContainerStatus(containerID=docker://db5681643d2a34b56bb5706a965bc732877b77e70162d479deff269340d19f3f, image=registry-vpc.cn-beijing.aliyuncs.com/acs/spark:spark-v2.4.5, imageID=docker-pullable://registry.cn-beijing.aliyuncs.com/regis-k/vicky@sha256:4105a09b45d9648e1a757538c0df2d482e8d58fae752d961a88486ecbbf9f24e, lastState=ContainerState(running=null, terminated=null, waiting=null, additionalProperties={}), name=spark-kubernetes-driver, ready=false, restartCount=0, state=ContainerState(running=null, terminated