The main processing logic of the HDFS sink lives in its process method:
// Loop batchSize times, or stop early once the Channel is empty
for (txnEventCount = 0; txnEventCount < batchSize; txnEventCount++) {
  // take() goes through the concrete implementation of BasicTransactionSemantics
  Event event = channel.take();
  if (event == null) {
    break;
  }
  ......
  // sfWriters is an LRU cache of open-file handles; the maximum number of
  // open files is controlled by the hdfs.maxOpenFiles parameter
  BucketWriter bucketWriter = sfWriters.get(lookupPath);
  // If there is no cached writer for this path, construct one and cache it
  if (bucketWriter == null) {
    // HDFSWriterFactory creates an HDFSWriter for the configured file type
    // (the hdfs.fileType parameter), e.g. HDFSDataStream
    HDFSWriter hdfsWriter = writerFactory.getWriter(fileType);
    // idleCallback removes the BucketWriter from the LRU cache once it has been flushed
    bucketWriter = new BucketWriter(rollInterval, rollSize, rollCount,
        batchSize, context, realPath, realName, inUsePrefix, inUseSuffix,
        suffix, codeC, compType, hdfsWriter, timedRollerPool,
        proxyTicket, sinkCounter, idleTimeout, idleCallback,
        lookupPath, callTimeout, callTimeoutPool);
    sfWriters.put(lookupPath, bucketWriter);
  }
  ......
  // Track every BucketWriter touched within this transaction
  if (!writers.contains(bucketWriter)) {
    writers.add(bucketWriter);
  }
  // Write the event to HDFS; internally bucketWriter.append(event) works roughly as follows:
  bucketWriter.append(event); ->
      open(); // if the underlying filesystem supports append, open via the append API; otherwise via create
      // Decide whether to roll the file: the number of replicas actually being
      // written is compared with the target replica count, and if the condition
      // is not met, doRotate is set to false
      if (doRotate) {
        close();
        open();
      }
      HDFSWriter.append(event);
      if (batchCounter == batchSize) { // flush once batchSize events have been appended
        flush(); ->
            doFlush() ->
                HDFSWriter.sync() ->
                    FSDataOutputStream.flush/sync
      }
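For reference, the open()/sync() steps in the trace above map onto the Hadoop FileSystem API roughly as shown below. This is a minimal sketch, not Flume's actual BucketWriter code; the class name, the example path, and the dfs.support.append check are illustrative assumptions:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class OpenAndSyncSketch {
  // Open the target file: reuse it via append() when the filesystem
  // supports appends, otherwise start a new file via create()
  static FSDataOutputStream open(FileSystem fs, Path path) throws IOException {
    boolean appendSupported = fs.getConf().getBoolean("dfs.support.append", false);
    if (appendSupported && fs.exists(path)) {
      return fs.append(path);
    }
    return fs.create(path);
  }

  public static void main(String[] args) throws IOException {
    FileSystem fs = FileSystem.get(new Configuration());
    Path path = new Path("/flume/events/events.1234.tmp"); // hypothetical in-use file
    FSDataOutputStream out = open(fs, path);
    out.write("event body\n".getBytes("UTF-8"));
    // Push buffered bytes to the datanodes; newer Hadoop exposes hflush(),
    // older versions exposed the same operation as sync()
    out.hflush();
    out.close();
  }
}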
// Before committing the transaction, flush every BucketWriter used in it
for (BucketWriter bucketWriter : writers) {
  bucketWriter.flush();
}
transaction.commit();
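The sfWriters cache seen earlier behaves like an LRU map keyed by the bucket path: once more than hdfs.maxOpenFiles writers are open, the least recently used one is evicted and its file handle closed. Below is a minimal, generic sketch of that eviction policy, assuming a Closeable value type rather than Flume's BucketWriter:

import java.io.Closeable;
import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.Map;

// Minimal LRU cache sketch: once more than maxOpenFiles entries are present,
// the least recently accessed value is evicted and its handle closed
class LruWriterCache<V extends Closeable> extends LinkedHashMap<String, V> {
  private final int maxOpenFiles;

  LruWriterCache(int maxOpenFiles) {
    super(16, 0.75f, true); // accessOrder = true: get() refreshes an entry's position
    this.maxOpenFiles = maxOpenFiles;
  }

  @Override
  protected boolean removeEldestEntry(Map.Entry<String, V> eldest) {
    if (size() > maxOpenFiles) {
      try {
        eldest.getValue().close(); // release the evicted file handle
      } catch (IOException e) {
        // log and continue; eviction should not fail the put()
      }
      return true;
    }
    return false;
  }
}

Passing accessOrder = true to the LinkedHashMap constructor is what turns insertion order into access order, so a get() on a frequently written bucket keeps its writer alive in the cache.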
Note that every BucketWriter operation (append, sync, rename, and so on) is submitted through callWithTimeout to a background thread pool and executed asynchronously; the size of that pool is set by the hdfs.threadsPoolSize parameter.
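A minimal sketch of that submit-and-wait pattern, assuming a plain fixed-size ExecutorService; the class and field names are illustrative, not Flume's exact implementation:

import java.io.IOException;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

public class CallWithTimeoutSketch {
  // Pool size plays the role of hdfs.threadsPoolSize
  private final ExecutorService callTimeoutPool = Executors.newFixedThreadPool(10);
  // Per-call deadline in milliseconds, playing the role of hdfs.callTimeout
  private final long callTimeout = 10000;

  // Run one HDFS operation on the background pool, waiting at most callTimeout ms
  <T> T callWithTimeout(Callable<T> op) throws Exception {
    Future<T> future = callTimeoutPool.submit(op);
    try {
      return future.get(callTimeout, TimeUnit.MILLISECONDS);
    } catch (TimeoutException e) {
      future.cancel(true); // interrupt the stuck HDFS call
      throw new IOException("HDFS call timed out after " + callTimeout + " ms", e);
    }
  }
}

Bounding each call with Future.get(timeout) keeps a hung NameNode or DataNode RPC from blocking the sink thread indefinitely.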