问题描述
通过离线同步任务写入FTP的gzip压缩文件无法解压,使用cat查看也显示为乱码
排查过程
1、配置ftp服务,详情参见手动搭建FTP站点
2、添加ftp数据源
3、脚本模式创建离线同步任务
{ "type": "job", "version": "2.0", "steps": [ { "stepType": "mysql", "parameter": { "indexes": [], "envType": 1, "datasource": "***", "useSpecialSecret": false, "column": [ "col1", "id", "url", "num" ], "tableComment": "", "connection": [ { "datasource": "***", "table": [ "test005" ] } ], "where": "", "splitPk": "col1", "encoding": "UTF-8" }, "name": "Reader", "category": "reader" }, { "stepType": "ftp", "parameter": { "fileName": "test005.csv.gz", "singleFileOutput": true, "nullFormat": "", "dateFormat": "yyyy-MM-dd HH:mm:ss", "compress": "gzip", "column": [ "0", "1", "2", "3" ], "writeMode": "truncate", "fieldDelimiter": ",", "encoding": "UTF-8", "path": "/var/ftp/test", "fieldDelimiterOrigin": ",", "datasource": "ftp", "envType": 1, "fileFormat": "csv" }, "name": "Writer", "category": "writer" }, { "copies": 1, "parameter": { "nodes": [], "edges": [], "groups": [], "version": "2.0" }, "name": "Processor", "category": "processor" } ], "setting": { "errorLimit": { "record": "" }, "locale": "zh", "speed": { "throttle": false, "concurrent": 2 } }, "order": { "hops": [ { "from": "Reader", "to": "Writer" } ] } }
4、生成的文件cat是乱码,无法解压
5、在ftp Writer的parameter中添加"fileType"参数,并将已有的"singleFileOutput"由true改为false(如下所示),重新运行任务后,将生成的文件改名去掉后缀,即可正常解压和cat
"fileType": "binary", "singleFileOutput": false
问题原因
该问题由FTP的传输模式导致。FTP协议默认使用ASCII模式传输,在ASCII模式下,UTF-8编码的数据在序列化、反序列化过程中可能出现失真,导致文件内容损坏,因此需要改用binary(二进制)模式传输
解决方案
在ftp Writer的parameter中添加"fileType"参数,并将"singleFileOutput"设置为false(如下所示),任务运行完成后对生成的文件改名,然后再解压
"fileType": "binary", "singleFileOutput": false
适用范围
大数据开发治理平台DataWorks