flink 读取kafka 写入带kerberos认证的hive环境

本文涉及的产品
实时计算 Flink 版,5000CU*H 3个月
简介: flink 读取kafka 写入带kerberos认证的hive环境

1:pom.xml引依赖引入

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>demo</artifactId>
        <groupId>com.zyf</groupId>
        <version>1.0</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>
    <artifactId>KafkaFlink1.12.2HiveSink</artifactId>
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <flink.version>1.12.2</flink.version>
        <scala.binary.version>2.11</scala.binary.version>
        <hive.version>2.1.1-cdh6.2.0</hive.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-jdbc</artifactId>
            <version>${hive.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-filesystem_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>3.0.0</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <artifactSet>
                                <excludes>
                                    <exclude>com.google.code.findbugs:jsr305</exclude>
                                    <exclude>org.slf4j:*</exclude>
                                    <exclude>log4j:*</exclude>
                                </excludes>
                            </artifactSet>
                            <filters>
                                <filter>
                                    <!-- Do not copy the signatures in the META-INF folder.
                                    Otherwise, this might cause SecurityExceptions when using the JAR. -->
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                            <transformers>
                                <transformer
                                        implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass>kafkaFlinkHive</mainClass>
                                </transformer>
                            </transformers>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>

2:ConfigUtils类文件

import java.io.FileInputStream;
import java.util.Properties;
public class ConfigUtils {
    private Properties producerProp = new Properties();
    private Properties consumerProp = new Properties();
    private Properties commonProp = new Properties();
    public ConfigUtils(String KafkaConfigPath) {
        try {
            Properties prop = new Properties();
            FileInputStream in = new FileInputStream(KafkaConfigPath);
            prop.load(in);
            in.close();
            String keyPrefix;
            String keyValue;
            for (String key : prop.stringPropertyNames()) {
                keyPrefix = key.trim().split("\\.")[0];
                keyValue = key.trim().substring(key.trim().indexOf(".") + 1);
                switch (keyPrefix.toLowerCase()) {
                    case "producer":
                        this.producerProp.put(keyValue, prop.getProperty(key));
                        break;
                    case "consumer":
                        this.consumerProp.put(keyValue, prop.getProperty(key));
                        break;
                    default:
                        this.commonProp.put(key, prop.getProperty(key));
                        break;
                }
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }
    public Properties getProducerProp() {
        return producerProp;
    } 
    public void setProducerProp(Properties producerProp) {
        this.producerProp = producerProp;
    }
    public Properties getConsumerProp() {
        return consumerProp;
    }
    public void setConsumerProp(Properties consumerProp) {
        this.consumerProp = consumerProp;
    }
    public Properties getCommonProp() {
        return commonProp;
    }
    public void setCommonProp(Properties commonProp) {
        this.commonProp = commonProp;
    }
    @Override
    public String toString() {
        return "ConfigUtils{" +
                "producerProp=" + producerProp +
                ", consumerProp=" + consumerProp +
                ", commonProp=" + commonProp +
                '}';
    }
    public static void main(String[] args) {
        System.out.println(new ConfigUtils("d://ConfigUtils.properties").producerProp);
        System.out.println(new ConfigUtils("d://ConfigUtils.properties").consumerProp);
        System.out.println(new ConfigUtils("d://ConfigUtils.properties").commonProp);
    }
}

3:kafkaFlinkHive类文件

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.Encoder;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.utils.MultipleParameterTool;
import org.apache.flink.core.fs.Path;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink;
import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.DefaultRollingPolicy;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.security.UserGroupInformation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Properties;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
public class kafkaFlinkHive {
    static Logger logger = LoggerFactory.getLogger(kafkaFlinkHive.class);
    public static void main(String[] args) throws Exception {
        final MultipleParameterTool multipleParameterTool = MultipleParameterTool.fromArgs(args);
        if (!multipleParameterTool.has("path")) {
            System.out.println("Error: not exist --path /opt/your.properties");
            System.out.println("Usage: flink run -m yarn-cluster -d /opt/your.jar --path /opt/your.properties");
            System.exit(0);
        }
        ConfigUtils configUtils = new ConfigUtils(multipleParameterTool.get("path"));
        logger.info(configUtils.toString());
        Properties commonProp = configUtils.getCommonProp();
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.getConfig().setUseSnapshotCompression(true);
        env.enableCheckpointing(5000); // create a checkpoint every 5 seconds
        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);//default mode
        Properties consumerProp = configUtils.getConsumerProp();
        if (commonProp.containsKey("source.kafka.security.enable") && commonProp.getProperty("source.kafka.security.enable")
                .equalsIgnoreCase("true")) {
            consumerProp.setProperty("security.protocol", "SASL_PLAINTEXT");
            consumerProp.setProperty("sasl.mechanism", "GSSAPI");
            consumerProp.setProperty("sasl.kerberos.service.name", "kafka");
        }
        FlinkKafkaConsumer<String> kafkaConsumer = new FlinkKafkaConsumer<>(commonProp.getProperty("source.kafka.topic"),
                new SimpleStringSchema(), consumerProp);
        kafkaConsumer.setCommitOffsetsOnCheckpoints(true);
        if (commonProp.containsKey("source.kafka.offset.reset") && commonProp.getProperty("source.kafka.offset.reset")
                .equalsIgnoreCase("latest")) {
            kafkaConsumer.setStartFromLatest();
        }
        DataStream<String> sourceDataStream = env.addSource(kafkaConsumer).uid("source_kafka")
                .setParallelism(Integer.valueOf(commonProp.getProperty("source.kafka.parallelism")));
        DataStream<String> convertData = sourceDataStream.map(new MapFunction<String, String>() {
            @Override
            public String map(String value) {
                return value;
            }
        }).uid("convert_data").setParallelism(Integer.valueOf(commonProp.getProperty("convert.format.parallelism")));
        final StreamingFileSink<String> hdfsSink = StreamingFileSink
                .forRowFormat(new Path("hdfs:///tmp"), (Encoder<String>) (element, stream) -> {
                    PrintStream out = new PrintStream(stream);
                    String[] strArr = element.split(",");
                    if (strArr.length == 2 && isInteger(strArr[0])) {
                        out.println(element);
                    } else {
                        logger.error("data format must be : int,string");
                    }
                    if (commonProp.containsKey("sink.hive.security.enable") && commonProp.getProperty("sink.hive.security.enable")
                            .equalsIgnoreCase("true")) {
                        Configuration conf = new Configuration();
                        conf.set("hadoop.security.authentication", "Kerberos");
                        UserGroupInformation.setConfiguration(conf);
                        try {
                            UserGroupInformation.loginUserFromKeytab(commonProp.getProperty("sink.hive.keytab.principal"),
                                    commonProp.getProperty("sink.hive.keytab.path"));
                        } catch (IOException e) {
                            e.printStackTrace();
                        }
                    }
                }).withRollingPolicy(DefaultRollingPolicy.create()
                        .withRolloverInterval(TimeUnit.SECONDS.toMillis(Integer.valueOf(commonProp.getProperty("hdfs.rollover.interval.sec"))))
                        .withInactivityInterval(TimeUnit.SECONDS.toMillis(Integer.valueOf(commonProp.getProperty("hdfs.inactivity.interval.sec"))))
                        .withMaxPartSize(1024L * 1024L * 256L).build())
                .build();
        convertData.addSink(hdfsSink).uid("sink_hdfs").setParallelism(Integer.valueOf(commonProp.getProperty("sink.hdfs.parallelism")));
        convertData.addSink(new SinkToHive<>(commonProp)).uid("sink_hive").setParallelism(Integer.valueOf(commonProp.getProperty("sink.hdfs.parallelism")));
        env.execute("Kafka2Flink2Hive kerberos Example");
    }
    public static boolean isInteger(String str) {
        Pattern pattern = Pattern.compile("^[-\\+]?[\\d]*$");
        return pattern.matcher(str).matches();
    }
}

4:SinkToHive类文件

import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.state.FunctionInitializationContext;
import org.apache.flink.runtime.state.FunctionSnapshotContext;
import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import org.apache.hive.service.cli.HiveSQLException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.util.Properties;
public class SinkToHive<IN> extends RichSinkFunction<IN> implements CheckpointedFunction {
    private Logger logger = LoggerFactory.getLogger(SinkToHive.class);
    private String url;
    private String tableName;
    private Connection conn = null;
    private PreparedStatement preparedStatement = null;
    SinkToHive(Properties properties) {
        this.url = properties.getProperty("sink.hive.jdbc.url");
        this.tableName = properties.getProperty("sink.hive.table.name");
    }
    @Override
    public void open(Configuration parameters) throws Exception {
        Class.forName("org.apache.hive.jdbc.HiveDriver");
        conn = DriverManager.getConnection(url);
        logger.info("================open");
    }
    @Override
    public void close() throws Exception {
        try {
            if (preparedStatement != null) {
                preparedStatement.close();
            }
            if (conn != null) {
                conn.close();
            }
        } catch (Exception ex) {
            logger.error(ex.getMessage());
        }
    }
    @Override
    public void invoke(IN value, Context context) throws Exception {
        try {
            preparedStatement = conn.prepareStatement("load data inpath '/tmp/2020*/part-*' into table " + tableName);
            preparedStatement.execute();
        } catch (HiveSQLException ex) {
            logger.debug(ex.getMessage());
        }
    }
    @Override
    public void snapshotState(FunctionSnapshotContext context) throws Exception {
    }
    @Override
    public void initializeState(FunctionInitializationContext context) throws Exception {
    }
}

5:FlinkJob_Kafka2Hive.properties配置文件

#Kafka其他配置
source.kafka.topic=kafka_hive
source.kafka.offset.reset=latest
source.kafka.parallelism=1
source.kafka.security.enable=false
###################################### hive配置###############################
convert.format.parallelism=1
sink.hive.table.name=h3c_table
sink.hdfs.parallelism=1
sink.hive.jdbc.url=jdbc:hive2://zyf53.hde.com:2181,zyf55.hde.com:2181,zyf54.hde.com:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2
hdfs.rollover.interval.sec=1
hdfs.inactivity.interval.sec=1
sink.hive.security.enable=false
sink.hive.keytab.path=/opt/useradmin.keytab
sink.hive.keytab.principal=useradmin

相关文章
|
消息中间件 数据可视化 Kafka
【kafka可视化工具】kafka-eagle在windows环境的下载、安装、启动与访问
【kafka可视化工具】kafka-eagle在windows环境的下载、安装、启动与访问
1039 0
|
SQL 安全 Java
一篇文章彻底理解 HIVE 常见的三种 AUTHENTICATION 认证机制的配置与使用
一篇文章彻底理解 HIVE 常见的三种 AUTHENTICATION 认证机制的配置与使用
|
11天前
|
消息中间件 Java Kafka
【手把手教你Linux环境下快速搭建Kafka集群】内含脚本分发教程,实现一键部署多个Kafka节点
本文介绍了Kafka集群的搭建过程,涵盖从虚拟机安装到集群测试的详细步骤。首先规划了集群架构,包括三台Kafka Broker节点,并说明了分布式环境下的服务进程配置。接着,通过VMware导入模板机并克隆出三台虚拟机(kafka-broker1、kafka-broker2、kafka-broker3),分别设置IP地址和主机名。随后,依次安装JDK、ZooKeeper和Kafka,并配置相应的环境变量与启动脚本,确保各组件能正常运行。最后,通过编写启停脚本简化集群的操作流程,并对集群进行测试,验证其功能完整性。整个过程强调了自动化脚本的应用,提高了部署效率。
【手把手教你Linux环境下快速搭建Kafka集群】内含脚本分发教程,实现一键部署多个Kafka节点
|
5月前
|
消息中间件 Java Kafka
如何在Kafka分布式环境中保证消息的顺序消费?深入剖析Kafka机制,带你一探究竟!
【8月更文挑战第24天】Apache Kafka是一款专为实时数据管道和流处理设计的分布式平台,以其高效的消息发布与订阅功能著称。在分布式环境中确保消息按序消费颇具挑战。本文首先介绍了Kafka通过Topic分区实现消息排序的基本机制,随后详细阐述了几种保证消息顺序性的策略,包括使用单分区Topic、消费者组搭配单分区消费、幂等性生产者以及事务支持等技术手段。最后,通过一个Java示例演示了如何利用Kafka消费者确保消息按序消费的具体实现过程。
185 3
|
6月前
|
SQL 关系型数据库 MySQL
实时计算 Flink版产品使用问题之如何使用Flink SQL连接带有Kerberos认证的Hive
实时计算Flink版作为一种强大的流处理和批处理统一的计算框架,广泛应用于各种需要实时数据处理和分析的场景。实时计算Flink版通常结合SQL接口、DataStream API、以及与上下游数据源和存储系统的丰富连接器,提供了一套全面的解决方案,以应对各种实时计算需求。其低延迟、高吞吐、容错性强的特点,使其成为众多企业和组织实时数据处理首选的技术平台。以下是实时计算Flink版的一些典型使用合集。
|
8月前
|
消息中间件 Kafka Apache
【Kafka专栏】windows搭建Kafka环境 & 详细教程(01)
【Kafka专栏】windows搭建Kafka环境 & 详细教程(01)
503 2
|
8月前
|
消息中间件 Kafka Linux
初尝Kafka(二):搭建环境
初尝Kafka(二):搭建环境
67 1
|
8月前
|
消息中间件 网络协议 Kafka
win10中docker安装kafka环境和操作
win10中docker安装kafka环境和操作
252 0
|
8月前
|
SQL 消息中间件 存储
案例:Flume消费Kafka数据保存Hive
案例:Flume消费Kafka数据保存Hive
241 0
|
消息中间件 分布式计算 资源调度
深度解读flink kerberos认证(含流程图及源码)
深度解读flink kerberos认证(含流程图及源码)
147 0