1.抽取到hdfs成功后,就可以连接hive,创建外表了
建立外表
beeline -u jdbc:hive2://192.168.186.14:10010/default -n hdfs
-- External table over the raw tab-delimited extract in HDFS.
-- All columns are STRING because the upstream extract is plain text; cast downstream as needed.
CREATE EXTERNAL TABLE nj12345.case_info_ex (
    CASE_SERIAL        STRING,
    CASE_TITLE         STRING,
    CASE_REGISTER      STRING,
    CASE_REGISTERNO    STRING,
    CASE_DATE          STRING,
    CASE_SOURCE        STRING,
    CASE_SOURCE_DETAIL STRING,
    PHONE_NUMBER       STRING,
    APPLICANT_NAME     STRING,
    APPLICANT_SEX      STRING,
    APPLICANT_AGE      STRING,
    APPLICANT_ID       STRING,
    CASE_TYPE          STRING,
    CASE_ACCORD        STRING,
    CASE_CONTENT       STRING,
    CASE_PROCESS_TYPE  STRING,
    CASE_ISPUBLIC      STRING,
    CASE_ISVISIT       STRING,
    CASE_ISURGENT      STRING,
    CASE_MARK          STRING,
    AREA_CODE          STRING,
    CASE_SERIAL_TURN   STRING,
    TSIGNTIME_BF       STRING,
    TFDBACKTIME_BF     STRING,
    TBACKTIME_BF       STRING,
    RELATE_SERIAL      STRING,
    ROWGUID            STRING,
    OPERATEDATE        STRING,
    CASE_AREA_CODE     STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LOCATION '/tmp/nj12345/case_info/';
2.然后就可以进行查询了,如果想对hive数据进行增删改,那么需要将数据导入到事务表中
建立事务表
-- Transactional (ACID) copy of the external table so UPDATE/DELETE are possible.
-- Hive ACID requires ALL THREE of: ORC storage, bucketing, and
-- TBLPROPERTIES ('transactional'='true'). The original DDL omitted the
-- TBLPROPERTIES clause, so the table was not actually transactional and
-- UPDATE/DELETE against it would fail.
-- NOTE(review): the session/metastore must also have ACID enabled
-- (hive.support.concurrency=true, hive.txn.manager=DbTxnManager) — confirm on the cluster.
CREATE TABLE nj12345.case_info (
    CASE_SERIAL        STRING,
    CASE_TITLE         STRING,
    CASE_REGISTER      STRING,
    CASE_REGISTERNO    STRING,
    CASE_DATE          STRING,
    CASE_SOURCE        STRING,
    CASE_SOURCE_DETAIL STRING,
    PHONE_NUMBER       STRING,
    APPLICANT_NAME     STRING,
    APPLICANT_SEX      STRING,
    APPLICANT_AGE      STRING,
    APPLICANT_ID       STRING,
    CASE_TYPE          STRING,
    CASE_ACCORD        STRING,
    CASE_CONTENT       STRING,
    CASE_PROCESS_TYPE  STRING,
    CASE_ISPUBLIC      STRING,
    CASE_ISVISIT       STRING,
    CASE_ISURGENT      STRING,
    CASE_MARK          STRING,
    AREA_CODE          STRING,
    CASE_SERIAL_TURN   STRING,
    TSIGNTIME_BF       STRING,
    TFDBACKTIME_BF     STRING,
    TBACKTIME_BF       STRING,
    RELATE_SERIAL      STRING,
    ROWGUID            STRING,
    OPERATEDATE        STRING,
    CASE_AREA_CODE     STRING
)
CLUSTERED BY (CASE_SERIAL) INTO 250 BUCKETS
STORED AS ORC
TBLPROPERTIES ('transactional' = 'true');
3.外表数据导入到事务表
-- Copy the raw external-table rows into the ORC transactional table.
-- Fixes the original's typo ("insert insto") and replaces the placeholder
-- table names with the concrete tables created above. SELECT * is acceptable
-- here only because both tables were created with the identical column list
-- in the identical order.
INSERT INTO nj12345.case_info SELECT * FROM nj12345.case_info_ex;
注意坑:
-- Run on the SOURCE database BEFORE extracting to HDFS: strips LF (CHAR(10)) and
-- CR (CHAR(13)) out of FINISH_NOTE, because embedded newlines in a field would be
-- interpreted as row terminators by the text extract and split one record into several.
-- NOTE: deliberately no WHERE clause — this rewrites FINISH_NOTE on every row.
-- NOTE(review): CHAR()/REPLACE() syntax shown is for the source RDBMS, not Hive — confirm
-- which engine this runs against before reusing elsewhere.
UPDATE case_result_info SET FINISH_NOTE= REPLACE(REPLACE(FINISH_NOTE, CHAR(10), ''), CHAR(13), '');
如果表中字段含有换行符,文本抽取会把换行符当作行分隔符,导致一条记录被错误地切分成多行;因此需先用上面的语句把该字段中的换行符替换掉,再进行HDFS抽取。