Finding Small Files in a Hadoop Cluster with FsImage

cd /tmp
# Fetch the latest FsImage from the NameNode to the local path ./tmp_meta
hdfs dfsadmin -fetchImage ./tmp_meta
# Convert the binary FsImage into a tab-delimited text file with the offline image viewer
hdfs oiv -i ./tmp_meta -o ./fsimage.csv -p Delimited
# Upload the delimited file to HDFS so it can be queried as an external table
hdfs dfs -mkdir -p /tmp/hdfs_metadata/fsimage
hdfs dfs -copyFromLocal ./fsimage.csv /tmp/hdfs_metadata/fsimage
-- Create an external Hive table over the tab-delimited FsImage output
drop table if exists hdfs_meta_temp;
create external table hdfs_meta_temp (
  path string,
  repl int,
  modification_time string,
  accesstime string,
  preferredblocksize int,
  blockcount double,
  filesize double,
  nsquota int,
  dsquota int,
  permission string,
  username string,
  groupname string)
row format delimited fields terminated by '\t'
location '/tmp/hdfs_metadata/fsimage/';

-- Convert the temporary table into an Impala Parquet table
drop table if exists hdfs_meta;
create table hdfs_meta stored as parquet as
select path,
  repl,
  cast(concat(modification_time, ':00') as timestamp) modification_time,
  cast(concat(accesstime, ':00') as timestamp) accesstime,
  preferredblocksize,
  blockcount,
  filesize, nsquota, dsquota, permission, username, groupname
from hdfs_meta_temp;

The two parameters in instr(path,'/',1,2) specify the HDFS directory to analyze and the directory drill-down depth; the last argument of the instr() function is the drill-down depth.
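
As a minimal sketch of how the depth argument behaves (the sample path below is hypothetical), the four-argument Impala instr() returns the position of the nth occurrence of the substring:

select instr('/user/hive/warehouse/db1.db/t1/part-0', '/', 1, 2);  -- 2nd '/' is at position 6
select instr('/user/hive/warehouse/db1.db/t1/part-0', '/', 1, 4);  -- 4th '/' is at position 21
-- so instr(path,'/',1,4) > 0 keeps only paths at least four levels deep, and
-- strleft(path, instr(path,'/',1,4)-1) truncates the path to '/user/hive/warehouse'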

-- filesizes is in GB, avg_filesize is in KB; uncomment one of the shorter basepath
-- expressions (and the strleft filter) to change the directory drill-down depth
SELECT
  --concat('/',split_part(path,'/',2)) basepath,
  --concat('/',split_part(path,'/',2),'/',split_part(path,'/',3)) basepath,
  --concat('/',split_part(path,'/',2),'/',split_part(path,'/',3),'/',split_part(path,'/',4)) basepath,
  concat('/',split_part(path,'/',2),'/',split_part(path,'/',3),'/',split_part(path,'/',4),'/',split_part(path,'/',5)) basepath,
  sum(blockcount) blockcounts,
  cast(sum(filesize)/1024/1024/1024 as decimal(18,2)) filesizes,
  count(*) file_nums,
  cast(count(*)/sum(blockcount) as decimal(18,2)) as avg_block,
  cast(sum(filesize)/count(*)/1024 as decimal(18,2)) as avg_filesize
FROM tmp.hdfs_meta
WHERE instr(path,'/',1,4) > 0
  --and strleft(path, instr(path,'/',1,4)-1) = '/user/hive/warehouse'
GROUP BY basepath
ORDER BY file_nums desc, avg_filesize
LIMIT 200;

Summary

The SQL analysis above produces three key metrics: file_nums, blockcounts, and avg_filesize. Small-file analysis based on these three metrics works as follows:

The larger the file_nums/blockcounts ratio and the smaller avg_filesize, the more small files the HDFS directory or Hive table contains.
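
As a minimal sketch, assuming the thresholds below are tuned to your cluster (they are illustrative only), the same result set can be filtered to surface the worst offenders directly:

-- Hypothetical thresholds: directories with many files, nearly one file per block,
-- and an average file size under 1 MB are strong small-file candidates
SELECT basepath, file_nums, blockcounts, avg_filesize
FROM (
  SELECT concat('/',split_part(path,'/',2),'/',split_part(path,'/',3),'/',split_part(path,'/',4),'/',split_part(path,'/',5)) basepath,
         count(*) file_nums,
         sum(blockcount) blockcounts,
         cast(sum(filesize)/count(*)/1024 as decimal(18,2)) avg_filesize
  FROM tmp.hdfs_meta
  WHERE instr(path,'/',1,4) > 0
  GROUP BY basepath
) t
WHERE file_nums > 10000            -- hypothetical threshold
  AND file_nums/blockcounts >= 0.9 -- mostly single-block files
  AND avg_filesize < 1024          -- average file under 1 MB (avg_filesize is in KB)
ORDER BY file_nums DESC;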

Method 2

Use a Sqoop script to extract the database and table information from the Hive metastore into Hive:

# Import the database/table metadata (DBS, TBLS, SDS) from the Hive metastore MySQL
# database into the Hive table default.hive_tables_temp
sqoop import \
-D mapred.job.queue.name=bf_yarn_pool.development \
--connect "jdbc:mysql://10.0.20.107:3306/baofoo_hive_2" \
--username cs_yangz \
--password *** \
--query 'select c.NAME,c.DB_LOCATION_URI,a.TBL_NAME,a.OWNER,a.TBL_TYPE,b.LOCATION from TBLS a,SDS b,DBS c where a.SD_ID=b.SD_ID and a.DB_ID=c.DB_ID and $CONDITIONS' \
--fields-terminated-by ',' \
--delete-target-dir \
--hive-database default \
--target-dir /tmp/hive_tables_temp \
--hive-import \
--hive-overwrite \
--hive-table hive_tables_temp \
-m 1
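
With the table locations available in hive_tables_temp, a sketch like the following can attribute the small-file metrics to specific Hive tables. It assumes the two tables created above (default.hive_tables_temp and tmp.hdfs_meta) and that stripping the scheme/authority prefix from LOCATION yields paths matching those in the FsImage data; the cross join with a LIKE predicate can be expensive on large metastores:

-- Sketch only: match each file path in hdfs_meta against the table's LOCATION
SELECT t.name AS db_name,
       t.tbl_name,
       count(*) AS file_nums,
       cast(sum(m.filesize)/count(*)/1024 as decimal(18,2)) AS avg_filesize_kb
FROM default.hive_tables_temp t, tmp.hdfs_meta m
WHERE m.path LIKE concat(regexp_replace(t.location, '^hdfs://[^/]+', ''), '/%')
GROUP BY t.name, t.tbl_name
ORDER BY file_nums DESC
LIMIT 50;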