1. Scoping the problem from the Spark History UI
On the application's Jobs page, identify the problematic job, then open that job's details and inspect its stages to find the problematic stage. The tasks of the abnormally long stage are shown in the figure below: executors 41, 43, and 44 on node agent1 each run an abnormally long task, for example Task 51.
2. arthas debugging workflow
- Log in to node agent1 and attach to the target executor process:
[root@agent1 arthas-packaging-3.7.1-bin]# java -jar arthas-boot.jar
[INFO] JAVA_HOME: /opt/client/JDK/jdk1.8.0_372/jre
[INFO] arthas-boot version: 3.7.1
[INFO] Process 2724779 already using port 3658
[INFO] Process 2724779 already using port 8563
[INFO] Found existing java process, please choose one and input the serial number of the process, eg : 1. Then hit ENTER.
* [1]: 2724779 org.apache.spark.executor.YarnCoarseGrainedExecutorBackend
[2]: 1985694 com.huawei.bigdata.om.acs.security.acs.service.AccountServiceMain
[3]: 2724781 org.apache.spark.executor.YarnCoarseGrainedExecutorBackend
[4]: 2724773 org.apache.spark.executor.YarnCoarseGrainedExecutorBackend
[5]: 2251183 com.omm.pms.PMSMain
[6]: 1987920 org.apache.catalina.startup.Bootstrap
[7]: 3515557 com.huawei.bigdata.om.backup.BackupRecoveryPluginProcess
[8]: 2724794 org.apache.spark.executor.YarnCoarseGrainedExecutorBackend
[9]: 2724797 org.apache.spark.executor.YarnCoarseGrainedExecutorBackend
[10]: 1986762 com.huawei.bigdata.om.disaster.service.bootstrap.DRBootstrapService
[11]: 2726526 org.apache.spark.executor.YarnCoarseGrainedExecutorBackend
[12]: 2107959 org.apache.hadoop.hdfs.server.datanode.DataNode
[13]: 3228518 org.apache.hadoop.yarn.server.nodemanager.NodeManager
[14]: 2724784 org.apache.spark.executor.YarnCoarseGrainedExecutorBackend
[15]: 265949 org.apache.zookeeper.server.quorum.QuorumPeerMain
[16]: 3216682 org.apache.hadoop.util.RunJar
[17]: 2724789 org.apache.spark.executor.YarnCoarseGrainedExecutorBackend
[18]: 2724788 org.apache.spark.executor.YarnCoarseGrainedExecutorBackend
[19]: 2025987 org.apache.hadoop.fs.http.server.HttpFSServerWebServer
[20]: 2724791 org.apache.spark.executor.YarnCoarseGrainedExecutorBackend
[21]: 2706295 com.huawei.cloud.mrs.MetaServiceMain
[22]: 2724790 org.apache.spark.executor.YarnCoarseGrainedExecutorBackend
[23]: 1974977 com.galaxmanager.iam.ApplicationMain
[24]: 2724809 org.apache.spark.executor.YarnCoarseGrainedExecutorBackend
[25]: 2724808 org.apache.spark.executor.YarnCoarseGrainedExecutorBackend
[26]: 1993148 com.huawei.bigdata.om.aos.bootstrap.AosMainProcess
[27]: 2367366 org.apache.spark.deploy.SparkSubmit
[28]: 2348295 com.huawei.bigdata.om.agent.services.NodeAgent
[29]: 2216584 org.apache.hadoop.hdfs.qjournal.server.JournalNode
[30]: 1986099 com.huawei.bigdata.om.controller.bootstrap.ControllerService
[31]: 2726489 org.apache.spark.executor.YarnCoarseGrainedExecutorBackend
[32]: 2345431 org.apache.hadoop.hdfs.server.federation.router.DFSRouter
[33]: 2254036 com.omm.fms.service.FmsMain
[34]: 3840130 org.apache.hadoop.util.RunJar
[35]: 2822929 org.apache.ranger.server.tomcat.EmbeddedServer
[36]: 1360232 /opt/huawei/Bigdata/FusionInsight_JobGateway_8.3.0/install/FusionInsight-JobGateway-1.0.0/job-gateway/job-server/job-gateway-api-1.0.0.jar
[37]: 2726484 org.apache.spark.executor.YarnCoarseGrainedExecutorBackend
- Use arthas to locate the problematic thread and inspect its stack trace, pinning down where the thread is stuck in C++ code.
# Identify the problematic thread: the one running task 51.0
[arthas@2724779]$ thread
ID NAME GROUP PRIORITY STATE %CPU DELTA_TIME TIME INTERRUPTED DAEMON
127 arthas-command-execute system 5 RUNNABLE 0.29 0.000 0:0.033 false true
-1 C1 CompilerThread12 - -1 - 0.18 0.000 0:0.613 false true
-1 C1 CompilerThread17 - -1 - 0.08 0.000 0:0.607 false true
-1 VM Periodic Task Thread - -1 - 0.01 0.000 0:0.234 false true
-1 VM Thread - -1 - 0.0 0.000 0:0.260 false true
-1 C2 CompilerThread1 - -1 - 0.0 0.000 0:1.996 false true
-1 C2 CompilerThread5 - -1 - 0.0 0.000 0:0.937 false true
2 Reference Handler system 10 WAITING 0.0 0.000 0:0.026 false true
3 Finalizer system 8 WAITING 0.0 0.000 0:0.028 false true
4 Signal Dispatcher system 9 RUNNABLE 0.0 0.000 0:0.000 false true
116 Attach Listener system 9 RUNNABLE 0.0 0.000 0:0.037 false true
118 arthas-timer system 9 WAITING 0.0 0.000 0:0.000 false true
119 arthas-NettyHttpTelnetBootstrap-3-1 system 5 RUNNABLE 0.0 0.000 0:0.024 false true
120 arthas-NettyWebsocketTtyBootstrap-4-1 system 5 RUNNABLE 0.0 0.000 0:0.000 false true
121 arthas-NettyWebsocketTtyBootstrap-4-2 system 5 RUNNABLE 0.0 0.000 0:0.001 false true
122 arthas-shell-server system 9 TIMED_WAITING 0.0 0.000 0:0.000 false true
123 arthas-session-manager system 9 TIMED_WAITING 0.0 0.000 0:0.000 false true
124 arthas-UserStat system 9 WAITING 0.0 0.000 0:0.000 false true
126 arthas-NettyHttpTelnetBootstrap-3-2 system 5 RUNNABLE 0.0 0.000 0:0.157 false true
1 main main 5 WAITING 0.0 0.000 0:4.177 false false
25 fileExistenceProcess-1 main 5 TIMED_WAITING 0.0 0.000 0:0.010 false true
40 netty-rpc-env-timeout main 5 TIMED_WAITING 0.0 0.000 0:0.006 false true
41 rpc-client-3-1 main 5 RUNNABLE 0.0 0.000 0:0.445 false true
43 dispatcher-BlockManagerEndpoint1 main 5 WAITING 0.0 0.000 0:0.016 false true
44 RemoteBlock-temp-file-clean-thread main 5 TIMED_WAITING 0.0 0.000 0:0.049 false true
45 dispatcher-Executor main 5 WAITING 0.0 0.000 0:0.340 false true
53 shuffle-boss-7-1 main 5 RUNNABLE 0.0 0.000 0:0.007 false true
54 executor-kill-mark-cleanup main 5 TIMED_WAITING 0.0 0.000 0:0.014 false true
55 executor-heartbeater main 5 TIMED_WAITING 0.0 0.000 0:0.281 false true
59 Executor task launch worker for task 51.0 in stage 1.0 (TI main 5 RUNNABLE 0.0 0.000 0:0.991 false true
60 Executor task launch worker for task 141.0 in stage 1.0 (T main 5 RUNNABLE 0.0 0.000 0:0.716 false true
61 Executor task launch worker for task 6.0 in stage 1.0 (TID main 5 RUNNABLE 0.0 0.000 0:0.811 false true
62 Executor task launch worker for task 186.0 in stage 1.0 (T main 5 RUNNABLE 0.0 0.000 0:1.141 false true
63 Executor task launch worker for task 19.1 in stage 1.0 (TI main 5 RUNNABLE 0.0 0.000 0:1.230 false true
64 Executor task launch worker for task 96.0 in stage 1.0 (TI main 5 RUNNABLE 0.0 0.000 0:1.475 false true
# View the full stack of the thread for task 51.0; the thread is stuck in initializeReader
[arthas@2724779]$ thread 59
"Executor task launch worker for task 51.0 in stage 1.0 (TID 2054)" Id=59 RUNNABLE (in native)
at com.huawei.boostkit.scan.jni.OrcColumnarBatchJniReader.initializeReader(Native Method)
at com.huawei.boostkit.spark.jni.OrcColumnarBatchScanReader.initializeReaderJava(OrcColumnarBatchScanReader.java:204)
at org.apache.spark.sql.execution.datasources.orc.OmniOrcColumnarBatchReader.initialize(OmniOrcColumnarBatchReader.java:138)
at org.apache.spark.sql.execution.datasources.orc.OmniOrcFileFormat.$anonfun$buildReaderWithPartitionValues$1(OmniOrcFileFormat.scala:166)
at org.apache.spark.sql.execution.datasources.orc.OmniOrcFileFormat$$Lambda$1125/1756879328.apply(Unknown Source)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:209)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:270)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:116)
at org.apache.spark.sql.execution.BaseColumnarFileSourceScanExec$$anon$1.hasNext(ColumnarFileSourceScanExec.scala:433)
at org.apache.spark.sql.execution.ColumnarFilterExec$$anon$1.hasNext(ColumnarBasicPhysicalOperators.scala:222)
at org.apache.spark.sql.execution.ColumnarBroadcastExchangeExec$$anon$1.hasNext(ColumnarBroadcastExchangeExec.scala:100)
at scala.collection.Iterator.foreach(Iterator.scala:943)
at scala.collection.Iterator.foreach$(Iterator.scala:943)
at org.apache.spark.sql.execution.ColumnarBroadcastExchangeExec$$anon$1.foreach(ColumnarBroadcastExchangeExec.scala:98)
at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
at org.apache.spark.sql.execution.ColumnarBroadcastExchangeExec$$anon$1.to(ColumnarBroadcastExchangeExec.scala:98)
at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
at org.apache.spark.sql.execution.ColumnarBroadcastExchangeExec$$anon$1.toBuffer(ColumnarBroadcastExchangeExec.scala:98)
at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
at org.apache.spark.sql.execution.ColumnarBroadcastExchangeExec$$anon$1.toArray(ColumnarBroadcastExchangeExec.scala:98)
at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1020)
at org.apache.spark.rdd.RDD$$Lambda$909/2035059561.apply(Unknown Source)
at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2472)
at org.apache.spark.SparkContext$$Lambda$910/796133169.apply(Unknown Source)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:136)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$5(Executor.scala:568)
at org.apache.spark.executor.Executor$TaskRunner$$Lambda$676/872206814.apply(Unknown Source)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1645)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:571)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:750)
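The Java stack bottoms out at OrcColumnarBatchJniReader.initializeReader(Native Method), and the thread state is RUNNABLE "(in native)": arthas can only show that the thread has crossed the JNI boundary, not where it is stuck inside the C++ reader, which is why the next step switches to gdb. By the JNI naming convention the native method resolves to a C symbol derived from the fully qualified class name, which is exactly the frame gdb shows later (Java_com_huawei_boostkit_scan_jni_OrcColumnarBatchJniReader_initializeReader). A minimal sketch of that entry point follows; the parameter names mirror the gdb frame in section 3, but the parameter and return types and the body are assumptions for illustration, not the actual omniop-native-reader code.

// Sketch of the JNI entry point behind OrcColumnarBatchJniReader.initializeReader.
// Parameter names come from the gdb frame; types, return value, and body are assumed.
#include <jni.h>

extern "C" JNIEXPORT jlong JNICALL
Java_com_huawei_boostkit_scan_jni_OrcColumnarBatchJniReader_initializeReader(
        JNIEnv *env, jobject jObj, jobject jsonObj, jobjectArray vecFildsNames)
{
    // According to the gdb backtrace, this function goes through
    // orc::readFileOverride -> createHdfsFileInputStream -> fs::getHdfsFileSystem,
    // and that is where the thread ends up waiting. None of these frames are
    // visible to arthas, which stops at "Native Method".
    // ... parse jsonObj, open the ORC file over HDFS, build the reader ...
    return 0; // hypothetical reader handle returned to the Java side
}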
3. gdb debugging workflow
# Attach to the target process
[root@agent1 ~]# gdb
GNU gdb (GDB) openEuler 9.2-3.oe1
Copyright (C) 2020 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.
Type "show copying" and "show warranty" for details.
This GDB was configured as "aarch64-openEuler-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
<http://www.gnu.org/software/gdb/documentation/>.
For help, type "help".
Type "apropos word" to search for commands related to "word".
(gdb) attach 2799392
Attaching to process 2799392
[New LWP 2799420]
[New LWP 2799712]
[New LWP 2799728]
[New LWP 2799739]
[New LWP 2799748]
[New LWP 2799755]
[New LWP 2799760]
[New LWP 2799765]
[New LWP 2799770]
[New LWP 2799777]
[New LWP 2799783]
[New LWP 2799790]
[New LWP 2799795]
[New LWP 2799800]
[New LWP 2799805]
[New LWP 2799811]
[New LWP 2799816]
[New LWP 2799822]
[New LWP 2799826]
[New LWP 2799834]
[New LWP 2799839]
# Locate the target threads, e.g. 150-155 (the truncated "Executor task l" entries are the executor task launch worker threads)
(gdb) info threads
Id Target Id Frame
* 1 LWP 2799392 "java" 0x0000fffdaa27992c in ?? () from /usr/lib64/libpthread.so.0
2 LWP 2799420 "java" 0x0000fffdaa27e5a0 in pthread_cond_wait () from /usr/lib64/libpthread.so.0
3 LWP 2799712 "GC task thread#" 0x0000fffdaa27e5a0 in pthread_cond_wait () from /usr/lib64/libpthread.so.0
4 LWP 2799728 "GC task thread#" 0x0000fffdaa27e5a0 in pthread_cond_wait () from /usr/lib64/libpthread.so.0
5 LWP 2799739 "GC task thread#" 0x0000fffdaa27e5a0 in pthread_cond_wait () from /usr/lib64/libpthread.so.0
... (output omitted)
146 LWP 2806195 "dispatcher-Exec" 0x0000fffdaa27e5a0 in pthread_cond_wait () from /usr/lib64/libpthread.so.0
147 LWP 2806372 "shuffle-boss-5-" 0x0000fffdaa085234 in epoll_pwait () from /usr/lib64/libc.so.6
148 LWP 2806424 "executor-kill-m" 0x0000fffdaa27e888 in pthread_cond_timedwait () from /usr/lib64/libpthread.so.0
149 LWP 2806435 "executor-heartb" 0x0000fffdaa27e888 in pthread_cond_timedwait () from /usr/lib64/libpthread.so.0
150 LWP 2810300 "Executor task l" 0x0000fffdaa281e50 in ?? () from /usr/lib64/libpthread.so.0
151 LWP 2810306 "Executor task l" 0x0000fffdaa281e7c in ?? () from /usr/lib64/libpthread.so.0
152 LWP 2810318 "Executor task l" 0x0000fffdaa281e50 in ?? () from /usr/lib64/libpthread.so.0
153 LWP 2810329 "Executor task l" 0x0000fffdaa281e50 in ?? () from /usr/lib64/libpthread.so.0
154 LWP 2810332 "Executor task l" 0x0000fffdaa281e50 in ?? () from /usr/lib64/libpthread.so.0
155 LWP 2810337 "Executor task l" 0x0000fffdaa281e50 in ?? () from /usr/lib64/libpthread.so.0
156 LWP 2811034 "shuffle-client-" 0x0000fffdaa085234 in epoll_pwait () from /usr/lib64/libc.so.6
157 LWP 2811346 "shuffle-client-" 0x0000fffdaa085234 in epoll_pwait () from /usr/lib64/libc.so.6
# Inspect the stack of the target thread, i.e. thread 150
(gdb) thread 150
[Switching to thread 150 (LWP 2810300)]
#0 0x0000fffdaa281e50 in ?? () from /usr/lib64/libpthread.so.0
(gdb) bt
#0 0x0000fffdaa281e50 in ?? () from /usr/lib64/libpthread.so.0
#1 0x0000fffdaa27ad68 in pthread_mutex_lock () from /usr/lib64/libpthread.so.0
#2 0x0000ffd3df787f54 in __gthread_mutex_lock (__mutex=0xffd3df87bb08 <fs::mutex_>) at /usr/local/include/c++/8.5.0/aarch64-unknown-linux-gnu/bits/gthr-default.h:748
#3 0x0000ffd3df78813c in std::mutex::lock (this=0xffd3df87bb08 <fs::mutex_>) at /usr/local/include/c++/8.5.0/bits/std_mutex.h:103
#4 0x0000ffd3df7f91c8 in fs::getHdfsFileSystem (host="hacluster", port="-1") at /root/linlong/orz/omnioperator/omniop-native-reader/cpp/src/filesystem/hdfs_filesystem.cpp:120
#5 0x0000ffd3df7f5f5c in orc::HdfsFileInputStreamOverride::HdfsFileInputStreamOverride (this=0xffd9608ee6c0, uri=...) at /root/linlong/orz/omnioperator/omniop-native-reader/cpp/src/orcfile/OrcHdfsFileOverride.cc:40
#6 0x0000ffd3df7f5bd8 in orc::createHdfsFileInputStream (uri=...) at /root/linlong/orz/omnioperator/omniop-native-reader/cpp/src/orcfile/OrcHdfsFileOverride.cc:106
#7 0x0000ffd3df7f5ac0 in orc::readFileOverride (uri=...) at /root/linlong/orz/omnioperator/omniop-native-reader/cpp/src/orcfile/OrcFileOverride.cc:26
#8 0x0000ffd3df768340 in Java_com_huawei_boostkit_scan_jni_OrcColumnarBatchJniReader_initializeReader (env=0xffd6206e4a70, jObj=0xffdfaaebb770, jsonObj=0xffdfaaebb768, vecFildsNames=0xffdfaaebb760)
at /root/linlong/orz/omnioperator/omniop-native-reader/cpp/src/jni/OrcColumnarBatchJniReader.cpp:117
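The backtrace explains the hang: the task thread is blocked in pthread_mutex_lock (frames #0-#3) while fs::getHdfsFileSystem tries to acquire the global fs::mutex_ before opening the ORC file over HDFS (frames #4-#8). Every executor task thread that opens a file goes through the same global mutex, so one holder that is slow or stuck inside the critical section blocks all of the others, which matches the several "Executor task l" threads parked in libpthread in the info threads output above. Below is a minimal sketch of the locking pattern implied by those frames, under the assumption that getHdfsFileSystem guards a filesystem/connection cache with a single process-wide mutex; the cache_ map, the HdfsFileSystem type, and the connectToHdfs helper are hypothetical, not the actual omniop-native-reader code.

// Sketch of the pattern implied by frames #2-#4: a process-wide mutex guarding an
// HDFS filesystem cache. Only fs::getHdfsFileSystem and fs::mutex_ appear in the
// backtrace; everything else here (cache_, connectToHdfs, HdfsFileSystem) is assumed.
#include <map>
#include <memory>
#include <mutex>
#include <string>

namespace fs {

struct HdfsFileSystem {};  // stand-in for the real filesystem handle type

static std::mutex mutex_;  // the global lock shown as fs::mutex_ in frames #2/#3
static std::map<std::string, std::shared_ptr<HdfsFileSystem>> cache_;

// Hypothetical connection routine: in the real code this would talk to the
// NameNode of "hacluster"; if that call stalls, mutex_ stays held the whole time.
static std::shared_ptr<HdfsFileSystem> connectToHdfs(const std::string &host,
                                                     const std::string &port) {
    (void)host;
    (void)port;
    return std::make_shared<HdfsFileSystem>();
}

std::shared_ptr<HdfsFileSystem> getHdfsFileSystem(const std::string &host,
                                                  const std::string &port) {
    std::lock_guard<std::mutex> lock(mutex_);  // frame #3: std::mutex::lock(fs::mutex_)
    const std::string key = host + ":" + port;
    auto it = cache_.find(key);
    if (it != cache_.end()) {
        return it->second;  // fast path: reuse a cached filesystem handle
    }
    // Slow path: every other task thread that opens a file queues on mutex_ here.
    auto fsHandle = connectToHdfs(host, port);
    cache_[key] = fsHandle;
    return fsHandle;
}

}  // namespace fs

Given this structure, the next gdb step is to find out which thread currently holds fs::mutex_ and what it is doing, for example by running thread apply all bt and looking for the thread whose frames are inside the critical section rather than waiting on the lock.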