Spark Streaming数据清理原因和现象
override def compute(validTime: Time): Option[KafkaRDD[K, V, U, T, R]] = { val untilOffsets = clamp(latestLeaderOffsets(maxRetries)) val rdd = KafkaRDD[K, V, U, T, R](
context.sparkContext, kafkaParams, currentOffsets, untilOffsets, messageHandler) // Report the record number and metadata of this batch interval to InputInfoTracker.
val offsetRanges = currentOffsets.map { case (tp, fo) =>
val uo = untilOffsets(tp) OffsetRange(tp.topic, tp.partition, fo, uo.offset)
} val description = offsetRanges.filter { offsetRange =>
// Don't display empty ranges.
offsetRange.fromOffset != offsetRange.untilOffset
}.map { offsetRange =>
s"topic: ${offsetRange.topic}\tpartition: ${offsetRange.partition}\t" +
s"offsets: ${offsetRange.fromOffset} to ${offsetRange.untilOffset}"
}.mkString("\n") // Copy offsetRanges to immutable.List to prevent from being modified by the user
val metadata = Map( "offsets" -> offsetRanges.toList, StreamInputInfo.METADATA_KEY_DESCRIPTION -> description) val inputInfo = StreamInputInfo(id, rdd.count, metadata)
ssc.scheduler.inputInfoTracker.reportInfo(validTime, inputInfo)
currentOffsets = untilOffsets.map(kv => kv._1 -> kv._2.offset) Some(rdd)
}
private val timer = new RecurringTimer(clock, ssc.graph.batchDuration.milliseconds, longTime => eventLoop.post(GenerateJobs(new Time(longTime))), "JobGenerator")
而JobGenerator中也有一个EventLoop来周期性的接收消息事件:
/** Processes all events */private def processEvent(event: JobGeneratorEvent) {
logDebug("Got event " + event)
event match { case GenerateJobs(time) => generateJobs(time) case ClearMetadata(time) => clearMetadata(time) case DoCheckpoint(time, clearCheckpointDataLater) => doCheckpoint(time, clearCheckpointDataLater) case ClearCheckpointData(time) => clearCheckpointData(time)
}
}
/** Clear DStream metadata for the given `time`. */private def clearMetadata(time: Time) {
ssc.graph.clearMetadata(time) // If checkpointing is enabled, then checkpoint,
// else mark batch to be fully processed
if (shouldCheckpoint) {
eventLoop.post(DoCheckpoint(time, clearCheckpointDataLater = true))
} else { // If checkpointing is not enabled, then delete metadata information about
// received blocks (block data not saved in any case). Otherwise, wait for
// checkpointing of this batch to complete.
val maxRememberDuration = graph.getMaxInputStreamRememberDuration()
jobScheduler.receiverTracker.cleanupOldBlocksAndBatches(time - maxRememberDuration)
jobScheduler.inputInfoTracker.cleanup(time - maxRememberDuration)
markBatchFullyProcessed(time)
}
}
def clearMetadata(time: Time) {
logDebug("Clearing metadata for time " + time) this.synchronized {
outputStreams.foreach(_.clearMetadata(time))
}
logDebug("Cleared old metadata for time " + time)
}
/** * Clear metadata that are older than `rememberDuration` of this DStream. * This is an internal method that should not be called directly. This default * implementation clears the old generated RDDs. Subclasses of DStream may override * this to clear their own metadata along with the generated RDDs. */private[streaming] def clearMetadata(time: Time) { val unpersistData = ssc.conf.getBoolean("spark.streaming.unpersist", true)// rememberDuration记忆周期 查看下RDD是否是oldRDD
val oldRDDs = generatedRDDs.filter(_._1 <= (time - rememberDuration))
logDebug("Clearing references to old RDDs: [" +
oldRDDs.map(x => s"${x._1} -> ${x._2.id}").mkString(", ") + "]")//从generatedRDDs中将key清理掉。
generatedRDDs --= oldRDDs.keys if (unpersistData) {
logDebug("Unpersisting old RDDs: " + oldRDDs.values.map(_.id).mkString(", "))
oldRDDs.values.foreach { rdd =>
rdd.unpersist(false) // Explicitly remove blocks of BlockRDD
rdd match { case b: BlockRDD[_] =>
logInfo("Removing blocks of RDD " + b + " of time " + time)
b.removeBlocks() //清理掉RDD的数据
case _ =>
}
}
}
logDebug("Cleared " + oldRDDs.size + " RDDs that were older than " +
(time - rememberDuration) + ": " + oldRDDs.keys.mkString(", "))//依赖的DStream也需要清理掉。
dependencies.foreach(_.clearMetadata(time))
}
/** * Remove the data blocks that this BlockRDD is made from. NOTE: This is an * irreversible operation, as the data in the blocks cannot be recovered back * once removed. Use it with caution. */private[spark] def removeBlocks() {
blockIds.foreach { blockId =>
sparkContext.env.blockManager.master.removeBlock(blockId)
}
_isValid = false}
回到JobGenerator中的processEvent看看 clearCheckpoint:清除缓存数据
/** Clear DStream checkpoint data for the given `time`. */private def clearCheckpointData(time: Time) {
ssc.graph.clearCheckpointData(time) // All the checkpoint information about which batches have been processed, etc have
// been saved to checkpoints, so its safe to delete block metadata and data WAL files
val maxRememberDuration = graph.getMaxInputStreamRememberDuration()
jobScheduler.receiverTracker.cleanupOldBlocksAndBatches(time - maxRememberDuration)
jobScheduler.inputInfoTracker.cleanup(time - maxRememberDuration)
markBatchFullyProcessed(time)
}
clearCheckpointData:
def clearCheckpointData(time: Time) {
logInfo("Clearing checkpoint data for time " + time) this.synchronized {
outputStreams.foreach(_.clearCheckpointData(time))
}
logInfo("Cleared checkpoint data for time " + time)
}
ClearCheckpointData: 和清除元数据信息一样,还是清除DStream依赖的缓存数据。
private[streaming] def clearCheckpointData(time: Time) {
logDebug("Clearing checkpoint data")
checkpointData.cleanup(time)
dependencies.foreach(_.clearCheckpointData(time))
logDebug("Cleared checkpoint data")
}
DStreamCheckpointData:清除缓存的数据
/** * Cleanup old checkpoint data. This gets called after a checkpoint of `time` has been * written to the checkpoint directory. */def cleanup(time: Time) { // Get the time of the oldest checkpointed RDD that was written as part of the
// checkpoint of `time`
timeToOldestCheckpointFileTime.remove(time) match { case Some(lastCheckpointFileTime) =>
// Find all the checkpointed RDDs (i.e. files) that are older than `lastCheckpointFileTime`
// This is because checkpointed RDDs older than this are not going to be needed
// even after master fails, as the checkpoint data of `time` does not refer to those files
val filesToDelete = timeToCheckpointFile.filter(_._1 < lastCheckpointFileTime)
logDebug("Files to delete:\n" + filesToDelete.mkString(","))
filesToDelete.foreach { case (time, file) =>
try { val path = new Path(file) if (fileSystem == null) {
fileSystem = path.getFileSystem(dstream.ssc.sparkContext.hadoopConfiguration)
}
fileSystem.delete(path, true)
timeToCheckpointFile -= time
logInfo("Deleted checkpoint file '" + file + "' for time " + time)
} catch { case e: Exception =>
logWarning("Error deleting old checkpoint file '" + file + "' for time " + time, e)
fileSystem = null
}
} case None =>
logDebug("Nothing to delete")
}
}
private class JobHandler(job: Job) extends Runnable with Logging { import JobScheduler._
def run() { try { val formattedTime = UIUtils.formatBatchTime(
job.time.milliseconds, ssc.graph.batchDuration.milliseconds, showYYYYMMSS = false) val batchUrl = s"/streaming/batch/?id=${job.time.milliseconds}"
val batchLinkText = s"[output operation ${job.outputOpId}, batch time ${formattedTime}]"
ssc.sc.setJobDescription(
s"""Streaming job from <a href="$batchUrl">$batchLinkText</a>""")
ssc.sc.setLocalProperty(BATCH_TIME_PROPERTY_KEY, job.time.milliseconds.toString)
ssc.sc.setLocalProperty(OUTPUT_OP_ID_PROPERTY_KEY, job.outputOpId.toString) // We need to assign `eventLoop` to a temp variable. Otherwise, because
// `JobScheduler.stop(false)` may set `eventLoop` to null when this method is running, then
// it's possible that when `post` is called, `eventLoop` happens to null.
var _eventLoop = eventLoop if (_eventLoop != null) {
_eventLoop.post(JobStarted(job, clock.getTimeMillis())) // Disable checks for existing output directories in jobs launched by the streaming
// scheduler, since we may need to write output to an existing directory during checkpoint
// recovery; see SPARK-4835 for more details.
PairRDDFunctions.disableOutputSpecValidation.withValue(true) {
job.run()
}
_eventLoop = eventLoop if (_eventLoop != null) {//当Job完成的时候,eventLoop会发消息初始化onReceive
_eventLoop.post(JobCompleted(job, clock.getTimeMillis()))
}
} else { // JobScheduler has been stopped.
}
} finally {
ssc.sc.setLocalProperty(JobScheduler.BATCH_TIME_PROPERTY_KEY, null)
ssc.sc.setLocalProperty(JobScheduler.OUTPUT_OP_ID_PROPERTY_KEY, null)
}
}
}
}
EventLoop 的onReceive初始化接收到消息JobCompleted.
def start(): Unit = synchronized { if (eventLoop != null) return // scheduler has already been started
logDebug("Starting JobScheduler")
eventLoop = new EventLoop[JobSchedulerEvent]("JobScheduler") { override protected def onReceive(event: JobSchedulerEvent): Unit = processEvent(event) override protected def onError(e: Throwable): Unit = reportError("Error in job scheduler", e)
}
eventLoop.start()
processEvent:
private def processEvent(event: JobSchedulerEvent) { try {
event match { case JobStarted(job, startTime) => handleJobStart(job, startTime) case JobCompleted(job, completedTime) => handleJobCompletion(job, completedTime) case ErrorReported(m, e) => handleError(m, e)
}
} catch { case e: Throwable =>
reportError("Error in job scheduler", e)
}
}
private def handleJobCompletion(job: Job, completedTime: Long) { val jobSet = jobSets.get(job.time)
jobSet.handleJobCompletion(job)
job.setEndTime(completedTime)
listenerBus.post(StreamingListenerOutputOperationCompleted(job.toOutputOperationInfo))
logInfo("Finished job " + job.id + " from job set of time " + jobSet.time) if (jobSet.hasCompleted) {
jobSets.remove(jobSet.time)
jobGenerator.onBatchCompletion(jobSet.time)
logInfo("Total delay: %.3f s for time %s (execution: %.3f s)".format(
jobSet.totalDelay / 1000.0, jobSet.time.toString,
jobSet.processingDelay / 1000.0
))
listenerBus.post(StreamingListenerBatchCompleted(jobSet.toBatchInfo))
}
job.result match { case Failure(e) =>
reportError("Error running job " + job, e) case _ =>
}
}
备注:
1、DT大数据梦工厂微信公众号DT_Spark
2、IMF晚8点大数据实战YY直播频道号:68917580
3、新浪微博: http://www.weibo.com/ilovepains
亿速云「云服务器」,即开即用、新一代英特尔至强铂金CPU、三副本存储NVMe SSD云盘,价格低至29元/月。点击查看>>
免责声明:本站发布的内容(图片、视频和文字)以原创、转载和分享为主,文章观点不代表本网站立场,如果涉及侵权请联系站长邮箱:is@yisu.com进行举报,并提供相关证据,一经查实,将立刻删除涉嫌侵权内容。