您现在的位置是：首页 > 其他

当前栏目

Spark算子执行流程详解之三大数据

流程执行数据 Spark 详解之三算子

2023-06-13 09:20:26 时间

用于聚合RDD中的元素：先使用seqOp将RDD每个分区中的T类型元素聚合成U类型，再使用combOp将各分区聚合得到的U类型结果合并成一个U类型的最终结果。特别注意seqOp和combOp都会使用zeroValue的值，zeroValue的类型为U。

/**
 * Aggregates the elements of each partition with `seqOp`, then merges the
 * per-partition results with `combOp`. `zeroValue` is used as the starting
 * value both inside every partition and for the final merge.
 *
 * @param zeroValue initial value of type U for both aggregation stages
 * @param seqOp     folds one element of type T into an accumulator of type U
 * @param combOp    merges two accumulators of type U
 * @return the aggregated value of type U
 */
def aggregate[U: ClassTag](zeroValue: U)(seqOp: (U, T) => U, combOp: (U, U) => U): U = withScope {
  // Clone the zero value since we will also be serializing it as part of tasks
  var jobResult = Utils.clone(zeroValue, sc.env.serializer.newInstance())
  val cleanSeqOp = sc.clean(seqOp)
  val cleanCombOp = sc.clean(combOp)

  // zeroValue is the initial value; aggregatePartition is executed on the executors
  val aggregatePartition = (it: Iterator[T]) => it.aggregate(zeroValue)(cleanSeqOp, cleanCombOp)

  // jobResult starts from the initial value; merging each partition's result
  // into it is executed on the driver
  val mergeResult = (index: Int, taskResult: U) => jobResult = combOp(jobResult, taskResult)

  sc.runJob(this, aggregatePartition, mergeResult)
  jobResult
}

| {(x : Int,y : Int) => x + y},

| {(a : Int,b : Int) => a + b}

| )

res17: Int = 58

/**
 * Aggregate the elements of each partition, and then the results for all the partitions, using a
 * given associative and commutative function and a neutral "zero value". The function
 * op(t1, t2) is allowed to modify t1 and return it as its result value to avoid object
 * allocation; however, it should not modify t2.
 *
 * This behaves somewhat differently from fold operations implemented for non-distributed
 * collections in functional languages like Scala. This fold operation may be applied to
 * partitions individually, and then fold those results into the final result, rather than
 * apply the fold to each element sequentially in some defined ordering. For functions
 * that are not commutative, the result may differ from that of a fold applied to a
 * non-distributed collection.
 *
 * @param zeroValue neutral starting value, used per partition and for the final merge
 * @param op        binary function combining two values of type T
 * @return the folded value of type T
 */
def fold(zeroValue: T)(op: (T, T) => T): T = withScope {
  // Clone the zero value since we will also be serializing it as part of tasks
  var jobResult = Utils.clone(zeroValue, sc.env.closureSerializer.newInstance())
  val cleanOp = sc.clean(op)

  // First fold each partition on the executors
  val foldPartition = (iter: Iterator[T]) => iter.fold(zeroValue)(cleanOp)

  // Then merge the per-partition results on the driver
  val mergeResult = (index: Int, taskResult: T) => jobResult = op(jobResult, taskResult)
  sc.runJob(this, foldPartition, mergeResult)
  jobResult
}

分层进行aggregate。由于aggregate的时候各分区的计算结果是传输到driver端再进行合并的，如果分区比较多、计算结果返回的数据量比较大，那么driver端需要缓存大量的中间结果，这样就会加重driver端的内存和计算负担。因此treeAggregate把分区计算结果的合并仍旧放在executor端进行，在executor端不断合并结果，以缩小返回driver的数据量，最后在driver端进行最后一次合并。

/**
 * Aggregates the elements of this RDD in a multi-level tree pattern.
 *
 * @param zeroValue initial value of type U used when aggregating each partition
 * @param seqOp     folds one element of type T into an accumulator of type U
 * @param combOp    merges two accumulators of type U
 * @param depth suggested depth of the tree (default: 2)
 * @return the aggregated value of type U
 * @see [[org.apache.spark.rdd.RDD#aggregate]]
 */
def treeAggregate[U: ClassTag](zeroValue: U)(
    seqOp: (U, T) => U,
    combOp: (U, U) => U,
    depth: Int = 2): U = withScope {
  require(depth >= 1, s"Depth must be greater than or equal to 1 but got $depth.")
  if (partitions.length == 0) {
    Utils.clone(zeroValue, context.env.closureSerializer.newInstance())
  } else {
    val cleanSeqOp = context.clean(seqOp)
    val cleanCombOp = context.clean(combOp)

    // Aggregation function applied to each initial partition
    val aggregatePartition =
      (it: Iterator[T]) => it.aggregate(zeroValue)(cleanSeqOp, cleanCombOp)

    // Partially aggregate every initial partition first
    var partiallyAggregated = mapPartitions(it => Iterator(aggregatePartition(it)))
    var numPartitions = partiallyAggregated.partitions.length

    // Derive the per-level reduction factor from the requested depth
    val scale = math.max(math.ceil(math.pow(numPartitions, 1.0 / depth)).toInt, 2)
    // If creating an extra level doesn't help reduce
    // the wall-clock time, we stop tree aggregation.
    while (numPartitions > scale + numPartitions / scale) {
      numPartitions /= scale
      val curNumPartitions = numPartitions

      // Reduce the number of partitions by merging the results of several
      // partitions that map to the same key (index modulo curNumPartitions)
      partiallyAggregated = partiallyAggregated.mapPartitionsWithIndex {
        (i, iter) => iter.map((i % curNumPartitions, _))
      }.reduceByKey(new HashPartitioner(curNumPartitions), cleanCombOp).values
    }

    // Run the final reduce and return the result
    partiallyAggregated.reduce(cleanCombOp)
  }
}

comb: (a: Int, b: Int)Int

val z =sc.parallelize(List(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18),9)

scala> z.treeAggregate(0)(seq,comb,2)

res1: Int = 171

RDD中元素前两个传给输入函数，产生一个新的return值，新产生的return值与RDD中下一个元素（第三个元素）组成两个元素，再被传给输入函数，直到最后只有一个值为止。

/**
 * Reduces the elements of this RDD using the specified commutative and
 * associative binary operator.
 *
 * @param f binary operator combining two values of type T
 * @return the reduced value
 * @throws UnsupportedOperationException if no partition produced a value (empty RDD)
 */
def reduce(f: (T, T) => T): T = withScope {
  val cleanF = sc.clean(f)

  // Function that walks one partition; this is executed on the executors
  val reducePartition: Iterator[T] => Option[T] = iter => {
    if (iter.hasNext) {
      // reduceLeft traverses the partition from left to right
      Some(iter.reduceLeft(cleanF))
    } else {
      None
    }
  }
  var jobResult: Option[T] = None

  // Function that folds each partition's result into jobResult; this is
  // executed on the driver
  val mergeResult = (index: Int, taskResult: Option[T]) => {
    if (taskResult.isDefined) {
      jobResult = jobResult match {
        case Some(value) => Some(f(value, taskResult.get))
        case None => taskResult
      }
    }
  }
  sc.runJob(this, reducePartition, mergeResult)
  // Get the final result out of our Option, or throw an exception if the RDD was empty
  jobResult.getOrElse(throw new UnsupportedOperationException("empty collection"))
}

/**
 * Returns the max of this RDD as defined by the implicit Ordering[T].
 *
 * @param ord ordering used to compare elements
 * @return the maximum element of the RDD
 */
def max()(implicit ord: Ordering[T]): T = withScope {
  // Delegates to reduce, keeping the larger of each pair
  this.reduce(ord.max)
}

/**
 * Returns the min of this RDD as defined by the implicit Ordering[T].
 *
 * @param ord ordering used to compare elements
 * @return the minimum element of the RDD
 */
def min()(implicit ord: Ordering[T]): T = withScope {
  // Delegates to reduce, keeping the smaller of each pair
  this.reduce(ord.min)
}

/**
 * Reduces the elements of this RDD in a multi-level tree pattern.
 *
 * @param f binary operator combining two values of type T
 * @param depth suggested depth of the tree (default: 2)
 * @return the reduced value
 * @throws UnsupportedOperationException if the aggregation yields no value (empty RDD)
 * @see [[org.apache.spark.rdd.RDD#reduce]]
 */
def treeReduce(f: (T, T) => T, depth: Int = 2): T = withScope {
  require(depth >= 1, s"Depth must be greater than or equal to 1 but got $depth.")
  val cleanF = context.clean(f)

  // Reduce function applied to each initial partition
  val reducePartition: Iterator[T] => Option[T] = iter => {
    if (iter.hasNext) {
      Some(iter.reduceLeft(cleanF))
    } else {
      None
    }
  }

  // Partially reduce every initial partition first
  val partiallyReduced = mapPartitions(it => Iterator(reducePartition(it)))

  // Combines two optional partial results; an empty side is simply skipped
  val op: (Option[T], Option[T]) => Option[T] = (c, x) => {
    if (c.isDefined && x.isDefined) {
      Some(cleanF(c.get, x.get))
    } else if (c.isDefined) {
      c
    } else if (x.isDefined) {
      x
    } else {
      None
    }
  }

  // The actual work is delegated to treeAggregate, using op as both seqOp and combOp
  partiallyReduced.treeAggregate(Option.empty[T])(op, op, depth)
    .getOrElse(throw new UnsupportedOperationException("empty collection"))
}

treeReduce函数先是针对每个分区利用scala的reduceLeft函数进行计算；最后，再将局部合并后的RDD进行treeAggregate计算，这里的seqOp和combOp相同，初值为空（Option.empty）。在实际应用中，可以用treeReduce来代替reduce，主要适用于单个reduce操作开销比较大的场景，treeReduce可以通过调整深度来控制每次reduce的规模。其具体的执行流程不再详细叙述，可以参考treeAggregate方法。

猜你喜欢

助飞的双翼 | AI 传奇系列之四
php数据库密码的找回的步骤
如何提高 SRE 的影响力
java 保留字段volatile、transient、native、synchronized详解编程语言
SQL Server跨服务器操作数据库的图文方法(LinkedServer)
java中的Integer的toBinaryString()方法实例
号研究Oracle中的用法（oracle中的小于）
构建SQL Server平台实现企业信息可持续管理（sqlserver平台）
pycharm连接不上mysql中的数据库时_python Mysql时间带t
Oracle 链接超时抢救全面解决方案（oracle出现链接超时）
在Linux系统中安装QQ，轻松给你的生活带来更多乐趣（qq安装linux）
jQuery noConflict()方法解决库冲突
深入理解Oracle:必备数据库手册（oracle数据库手册）
Exchange2000系统建设及规划
在某个范围内随机生成一些数据_cut out删除造句
真·重磅研究！32篇论文硬核盘点2022年度AI热点
52岁的周鸿祎，还年轻吗？

zl程序教程

当前栏目

Spark算子执行流程详解之三大数据

相关文章