1. Prepare the environment
- Spark on Kubernetes cluster
- HiBench
- HDFS cluster
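Before editing any configuration, it is worth confirming that all three components above are reachable. A minimal verification sketch, assuming kubectl and a Hadoop client are already on the PATH (the addresses match the cluster used throughout this guide):

# Kubernetes API server answers and nodes are Ready
kubectl cluster-info
kubectl get nodes

# HDFS NameNode is up and the root path is listable
hadoop fs -ls hdfs://192.168.66.30:9000/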
2. Edit the HiBench configuration files
conf/spark.conf
# Spark home
hibench.spark.home /usr/local/spark-k8s/spark2.4 # Spark installation directory
# Spark master
# standalone mode: spark://xxx:7077
# YARN mode: yarn-client
hibench.spark.master k8s://https://192.168.66.30:6443 # Kubernetes API server address
# executor number and cores when running on YARN
hibench.yarn.executor.num 2 # executor count for Spark on YARN; ignored in k8s mode
hibench.yarn.executor.cores 1
spark.kubernetes.container.image hub.atguigu.com/library/spark:v2.4.7 # container image for driver and executor pods
# executor and driver memory in standalone & YARN mode
spark.executor.memory 500m # memory allocated to each executor
spark.executor.instances 2 # number of executor pods
spark.driver.memory 1g
# set spark parallelism property according to hibench's parallelism value
spark.default.parallelism ${hibench.default.map.parallelism}
# set spark sql's default shuffle partitions according to hibench's parallelism value
spark.sql.shuffle.partitions ${hibench.default.shuffle.parallelism}
#======================================================
# Spark Streaming
#======================================================
# Spark Streaming batch interval in milliseconds (default: 100)
hibench.streambench.spark.batchInterval 100
# Number of nodes that will receive Kafka input (default: 4)
hibench.streambench.spark.receiverNumber 4
# Indicate RDD storage level. (default: 2)
# 0 = StorageLevel.MEMORY_ONLY
# 1 = StorageLevel.MEMORY_AND_DISK_SER
# other = StorageLevel.MEMORY_AND_DISK_SER_2
hibench.streambench.spark.storageLevel 2
# Indicate whether to test the new write-ahead log feature (default: false)
hibench.streambench.spark.enableWAL false
# If WAL is enabled, the HDFS path for storing the stream context must be specified; otherwise it can be left empty (default: /var/tmp)
hibench.streambench.spark.checkpointPath /var/tmp
# whether to use the direct approach or not (default: true)
hibench.streambench.spark.useDirectMode true
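For orientation, HiBench's run scripts assemble the properties above into a spark-submit invocation roughly like the one below. This is a sketch only: the workload class and the jar path inside the image are illustrative placeholders that the scripts fill in for the actual workload.

/usr/local/spark-k8s/spark2.4/bin/spark-submit \
  --master k8s://https://192.168.66.30:6443 \
  --deploy-mode cluster \
  --class com.intel.hibench.sparkbench.micro.ScalaWordCount \
  --conf spark.executor.instances=2 \
  --conf spark.executor.memory=500m \
  --conf spark.driver.memory=1g \
  --conf spark.kubernetes.container.image=hub.atguigu.com/library/spark:v2.4.7 \
  local:///opt/spark/jars/sparkbench-assembly-7.1.1-dist.jar \
  <input-path> <output-path>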
conf/hadoop.conf
# Hadoop home
hibench.hadoop.home /usr/local/hadoop/hadoop # Hadoop installation directory
# The path of hadoop executable
hibench.hadoop.executable ${hibench.hadoop.home}/bin/hadoop
# Hadoop configuration directory
hibench.hadoop.configure.dir ${hibench.hadoop.home}/etc/hadoop
# The root HDFS path to store HiBench data
hibench.hdfs.master hdfs://192.168.66.30:9000 # HDFS NameNode address
# Hadoop release provider. Supported value: apache, cdh5, hdp
hibench.hadoop.release apache
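Before moving on, a quick sanity check that the configured executable and HDFS address line up (a sketch, run from the HiBench host):

# The configured executable should run and reach the NameNode
/usr/local/hadoop/hadoop/bin/hadoop version
/usr/local/hadoop/hadoop/bin/hadoop fs -df -h hdfs://192.168.66.30:9000/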
conf/hibench.conf
# Data scale profile. Available values are tiny, small, large, huge, gigantic and bigdata.
# The definition of these profiles can be found in the workload's conf file, e.g. conf/workloads/micro/wordcount.conf
hibench.scale.profile small # selected data scale for the benchmark
# Mapper number in hadoop, partition number in Spark
hibench.default.map.parallelism 8 # number of input partitions in Spark
# Reducer number in hadoop, shuffle partition number in Spark
hibench.default.shuffle.parallelism 8 # number of shuffle partitions in Spark
#======================================================
# Report files
#======================================================
# default report formats
hibench.report.formats "%-12s %-10s %-8s %-20s %-20s %-20s %-20s\n"
# default report dir path
hibench.report.dir ${hibench.home}/report
# default report file name
hibench.report.name hibench.report
# input/output format settings. Available formats: Text, Sequence, Null.
sparkbench.inputformat Sequence
sparkbench.outputformat Sequence
# hibench config folder
hibench.configure.dir ${hibench.home}/conf
# default hibench HDFS root
hibench.hdfs.data.dir ${hibench.hdfs.master}/HiBench
# path of hibench jars
hibench.hibench.datatool.dir ${hibench.home}/autogen/target/autogen-7.1.1-jar-with-dependencies.jar
hibench.common.jar ${hibench.home}/common/target/hibench-common-7.1.1-jar-with-dependencies.jar
hibench.sparkbench.jar ${hibench.home}/sparkbench/assembly/target/sparkbench-assembly-7.1.1-dist.jar
hibench.streambench.stormbench.jar ${hibench.home}/stormbench/streaming/target/stormbench-streaming-7.1.1.jar
hibench.streambench.gearpump.jar ${hibench.home}/gearpumpbench/streaming/target/gearpumpbench-streaming-7.1.1-jar-with-dependencies.jar
hibench.streambench.flinkbench.jar ${hibench.home}/flinkbench/streaming/target/flinkbench-streaming-7.1.1-jar-with-dependencies.jar
#======================================================
# workload home/input/output path
#======================================================
hibench.hive.home ${hibench.home}/hadoopbench/sql/target/${hibench.hive.release}
hibench.hive.release apache-hive-0.14.0-bin
hibench.hivebench.template.dir ${hibench.home}/hadoopbench/sql/hive_template
hibench.bayes.dir.name.input ${hibench.workload.dir.name.input}
hibench.bayes.dir.name.output ${hibench.workload.dir.name.output}
hibench.mahout.release.apache apache-mahout-distribution-0.11.0
hibench.mahout.release.hdp apache-mahout-distribution-0.11.0
hibench.mahout.release.cdh5 mahout-0.9-cdh5.1.0
hibench.mahout.release ${hibench.mahout.release.${hibench.hadoop.release}}
hibench.mahout.home ${hibench.home}/hadoopbench/mahout/target/${hibench.mahout.release}
hibench.masters.hostnames
hibench.slaves.hostnames
hibench.workload.input
hibench.workload.output
hibench.workload.dir.name.input Input
hibench.workload.dir.name.output Output
hibench.nutch.dir.name.input ${hibench.workload.dir.name.input}
hibench.nutch.dir.name.output ${hibench.workload.dir.name.output}
hibench.nutch.nutchindexing.dir ${hibench.home}/hadoopbench/nutchindexing/
hibench.nutch.release nutch-1.2
hibench.nutch.home ${hibench.home}/hadoopbench/nutchindexing/target/${hibench.nutch.release}
hibench.dfsioe.dir.name.input ${hibench.workload.dir.name.input}
hibench.dfsioe.dir.name.output ${hibench.workload.dir.name.output}
#======================================================
# Streaming General
#======================================================
# Indicate whether in debug mode for correctness verification (default: false)
hibench.streambench.debugMode false
hibench.streambench.sampleProbability 0.1
hibench.streambench.fixWindowDuration 10000
hibench.streambench.fixWindowSlideStep 10000
#======================================================
# Kafka for streaming benchmarks
#======================================================
hibench.streambench.kafka.home /PATH/TO/YOUR/KAFKA/HOME
# zookeeper host:port of kafka cluster, host1:port1,host2:port2...
hibench.streambench.zkHost
# Kafka broker lists, written in mode host:port,host:port,..
hibench.streambench.kafka.brokerList
# number of partitions of generated topic (default 20)
hibench.streambench.kafka.topicPartitions 20
# consumer group of the consumer for kafka (default: HiBench)
hibench.streambench.kafka.consumerGroup HiBench
# Set the starting offset of kafkaConsumer (default: largest)
hibench.streambench.kafka.offsetReset largest
#======================================================
# Data generator for streaming benchmarks
#======================================================
# Interval span in millisecond (default: 50)
hibench.streambench.datagen.intervalSpan 50
# Number of records to generate per interval span (default: 5)
hibench.streambench.datagen.recordsPerInterval 5
# fixed length of record (default: 200)
hibench.streambench.datagen.recordLength 200
# Number of KafkaProducer running on different thread (default: 1)
hibench.streambench.datagen.producerNumber 1
# Total number of rounds of data to send (default: -1, meaning unbounded)
hibench.streambench.datagen.totalRounds -1
# Total number of records to generate (default: -1, meaning unbounded)
hibench.streambench.datagen.totalRecords -1
# default path to store seed files (default: ${hibench.hdfs.data.dir}/Streaming)
hibench.streambench.datagen.dir ${hibench.hdfs.data.dir}/Streaming
# default path settings for generated data1 & data2
hibench.streambench.datagen.data1.name Seed
hibench.streambench.datagen.data1.dir ${hibench.streambench.datagen.dir}/${hibench.streambench.datagen.data1.name}
hibench.streambench.datagen.data2_cluster.dir ${hibench.streambench.datagen.dir}/Kmeans/Cluster
hibench.streambench.datagen.data2_samples.dir ${hibench.streambench.datagen.dir}/Kmeans/Samples
#======================================================
# MetricsReader for streaming benchmarks
#======================================================
# Number of sample records for `MetricsReader` (default: 5000000)
hibench.streambench.metricsReader.sampleNum 5000000
# Number of thread for `MetricsReader` (default: 20)
hibench.streambench.metricsReader.threadNum 20
# The directory where benchmark reports are stored (default: ${hibench.home}/report)
hibench.streambench.metricsReader.outputDir ${hibench.home}/report
conf/workloads/micro/wordcount.conf
#datagen
hibench.wordcount.tiny.datasize 32000
hibench.wordcount.small.datasize 320000000
hibench.wordcount.large.datasize 3200000000
hibench.wordcount.huge.datasize 32000000000
hibench.wordcount.gigantic.datasize 320000000000
hibench.wordcount.bigdata.datasize 1600000000000
hibench.workload.datasize ${hibench.wordcount.${hibench.scale.profile}.datasize}
# export for shell script
hibench.workload.input ${hibench.hdfs.data.dir}/Wordcount/Input
hibench.workload.output ${hibench.hdfs.data.dir}/Wordcount/Output
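Note how the nested property reference resolves. With hibench.scale.profile set to small in conf/hibench.conf, the workload datasize becomes:

# ${hibench.wordcount.${hibench.scale.profile}.datasize}
#   -> ${hibench.wordcount.small.datasize}
#   -> 320000000 (the generator's target input size; roughly 320 MB)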
3. Prepare the input data
bin/workloads/micro/wordcount/prepare/prepare.sh
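prepare.sh generates the WordCount input and uploads it to HDFS. A quick check that the data landed, using the default paths from the configuration above:

hadoop fs -du -s -h hdfs://192.168.66.30:9000/HiBench/Wordcount/Input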
4. Run the benchmark
bin/workloads/micro/wordcount/spark/run.sh
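While run.sh is executing, the driver and executor pods can be watched coming up and terminating (assuming kubectl points at the same cluster; Spark places pods in the default namespace unless spark.kubernetes.namespace is set):

kubectl get pods -w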
After the run completes, the results can be viewed in report/hibench.report.
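The report is a plain-text table to which each completed workload appends one row; per the format string in conf/hibench.conf, the columns are workload type, date, time, input data size, duration (s), throughput (bytes/s), and throughput per node:

cat report/hibench.report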