大数据项目实战之新闻话题统计分析

本文涉及的产品
RDS MySQL Serverless 基础系列,0.5-2RCU 50GB
云数据库 RDS MySQL,集群系列 2核4GB
推荐场景:
搭建个人博客
实时计算 Flink 版,5000CU*H 3个月
简介: 前言:本文是一个完整的大数据项目实战,实时|离线统计分析用户的搜索话题,并用JavaEE工程前端界面展示出来。这些指标对网站的精准营销、运营都有极大帮助。架构大致是按照企业标准来的,从日志的采集、转化处理、实时计算、JAVA后台开发、WEB前端展示,一条完整流程线下来,甚至每个节点都用的高可用架构,都考虑了故障转移和容错性。
前言:本文是一个完整的大数据项目实战, 实时|离线 统计分析用户的搜索话题,并用JavaEE工程前端界面展示出来。这些指标对网站的精准营销、运营都有极大帮助。架构大致是按照企业标准来的,从日志的采集、转化处理、实时计算、JAVA后台开发、WEB前端展示,一条完整流程线下来,甚至每个节点都用的高可用架构,都考虑了故障转移和容错性。所用到的框架包括 : Hadoop(HDFS+MapReduce+Yarn)+Flume+Kafka+HBase+Hive+Spark(SQL、Streaming )+Mysql+SpringMVC+Mybatis+Websocket+AngularJS+Echarts 。所涉及到的语言包括: JAVA、Scala、Shell  

项目源代码联系邮箱:tangzhi8023@gmail.com
项目架构图:
(项目架构图:原文此处为图片,文本版中已无法显示)

     一:数据源处理(搜狗实验室获取新闻资源 XML——>TXT:java解析大批量xml文件 代码后贴)

        处理思路:利用SAXReader获取xml文件内容,并构建News实体类以便写入txt文件,然后编写 ReadWebLog类并编写脚本运行在Linux上模拟新闻搜索日志产生
       Linux运行jar命令:java -jar 你的上传jar包所在目录  args0 args1
       或Shell脚本命令:
#!/bin/bash
echo "start log"
java -jar 你的上传jar包所在目录  args0 args1

代码:
处理搜狗实验室元数据.xml----->txt
package cn.yusys.hotnews.datasource;

import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;

/**
* 解析搜狗实验室新闻xml文件为txt文件---->项目数据源
* @author Tangzhi mail:tangzhi8023@gmail.com
* Created on 2018年11月12日
*/
public class MyXMLReader2JDOM {
    public static void main(String[] args) {
        // 获取xml文件读取流
        SAXReader reader = new SAXReader();
        // 设置字符集编码方式
        reader.setEncoding("utf-8");
        Document document;
        Element rootElement;
        List<Element> docList;
        Iterator<Element> iterator;
        // 用于存放节点数据以便后面的写入之news.log
        ArrayList<News> list = new ArrayList<News>();
        // 开始进行读取
        try {
            document = reader.read(new File("D:\\Downloads\\大数据数据源\\news_tensite_xml.smarty.dat"));
            // 得到根节点元素 <docs>...</docs>
            rootElement = document.getRootElement();
            //<doc>...<doc>
            docList = rootElement.elements("doc");
         /*
          * 得到xml具体配置文件信息
          */
            iterator = docList.iterator();
            for (Element e : docList) {
                News news = new News();
                /**
                 * 遍历子节点将具体新闻信息写入txt文件
                 */
                if (e.element("url") != null && !" ".equals(e.element("url"))) {
                    news.setUrl(e.element("url").getStringValue().trim());
                }
                if (e.element("docno") != null && !" ".equals(e.element("docno"))) {
                    news.setDocno(e.element("docno").getStringValue().trim());
                }
                if (e.element("contenttitle") != null && !" ".equals(e.element("contenttitle"))) {
                    news.setContenttitle(e.element("contenttitle").getStringValue().trim());
                }
                if (e.element("content") != null && !" ".equals(e.element("content"))) {
                    news.setContent(e.element("content").getStringValue().trim());
                }
                list.add(news);
            }
            /**
             * 进行写入txt文件
             */
            writwToFile(list);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    /**
     * 写入txt文件(后期当源数据文件过大时进行分片处理)
     * @throws IOException
     */
    public static void writwToFile(List<News> list) throws IOException {
        File file = new File("D:\\Downloads\\大数据数据源\\news2.log");
        BufferedWriter bw = new BufferedWriter(new FileWriter(file));
        if (!file.exists()) {
            try {
                file.createNewFile();
            } catch (IOException e) {
                e.printStackTrace();
            }
        } else {
            for (News news : list) {
                Date date = new Date();
                SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                String dateStr = sdf.format(date);
                bw.write("datetime"+"="+dateStr+"|");
                bw.write("url"+"="+news.getUrl()+"|");
                bw.write("docno"+"="+news.getDocno()+"|");
                bw.write("contenttitle"+"="+news.getContenttitle()+"|");
                bw.write("content"+"="+news.getContent());
                bw.write("\n");
                bw.flush();
            }
        }
    }
}
----------------------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------------------
package cn.yusys.hotnews.datasource;
/**
 * News entity produced while parsing the Sogou Lab xml corpus.
 * Implements Serializable so instances can be shipped between machines when
 * the parsing step is distributed.
 */
public class News implements java.io.Serializable {
    // BUG FIX: the original declared "implements Serializable" without
    // importing java.io.Serializable, so it did not compile; the
    // fully-qualified name avoids touching the file's import section.
    private static final long serialVersionUID = 1L;

    private String url;
    private String docno;
    private String contenttitle;
    private String content;

    /** No-arg constructor required for bean-style population via setters. */
    public News() {
    }

    /** Convenience constructor populating all fields at once. */
    public News(String url, String docno, String contenttitle, String content) {
        super();
        this.url = url;
        this.docno = docno;
        this.contenttitle = contenttitle;
        this.content = content;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getDocno() {
        return docno;
    }

    public void setDocno(String docno) {
        this.docno = docno;
    }

    public String getContenttitle() {
        return contenttitle;
    }

    public void setContenttitle(String contenttitle) {
        this.contenttitle = contenttitle;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }
}
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
运行在Linux上模拟日志产生并通过flume采集
package cn.yusys.hotnews.datasource;
import java.io.*;
/**
 * Simulates a log server: reads news.log/news1.log line by line and
 * re-emits each line into a destination log file that flume tails.
 * @author Tangzhi mail:tangzhi8023@gmail.com
 * @date 2018-11-12
 */
public class ReadWebLog {
    /** Path of the source log file (args[0]). */
    public static String readFileName;
    /** Path of the destination log file (args[1]); must match the flume source config. */
    public static String writeFileName;
    /** Delay between emitted lines, in milliseconds (was hard-coded to 1000). */
    public static long readIntervalMs = 1000;

    public static void main(String[] args) {
        readFileName = args[0];
        writeFileName = args[1];
        readFile(readFileName);
    }

    /**
     * Reads the source log and re-emits every line to the destination file
     * with a delay, simulating live log production.
     * @param fileName path of the source log file
     */
    public static void readFile(String fileName) {
        // try-with-resources guarantees the reader is closed on every exit path.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(fileName), "utf-8"))) {
            int count = 0;
            String line;
            // BUG FIX: the original called br.readLine() twice per iteration
            // (once in the condition, once in the body), which silently dropped
            // every other line and could pass null to writeFile.
            while ((line = br.readLine()) != null) {
                count++;
                // Pause between lines to simulate a live log stream.
                Thread.sleep(readIntervalMs);
                System.out.println("row:" + count + ">>>>>>>>" + line);
                // Append the line to the file watched by flume.
                writeFile(writeFileName, line);
            }
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can observe it.
            Thread.currentThread().interrupt();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Appends a single line (plus newline) to the destination file.
     * @param fileName destination file path
     * @param line     line to append
     */
    public static void writeFile(String fileName, String line) {
        // BUG FIX: write with utf-8 to match the reader's charset (the
        // original used the platform default, risking mojibake), and close
        // the writer via try-with-resources instead of leaking on exceptions.
        try (BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(fileName, true), "utf-8"))) {
            bw.write(line);
            bw.write("\n");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}


Q&A
Q1:

Java异常: "2 字节的 UTF-8 序列的字节 2 无效。"  

(此处原文为报错截图,文本版中已无法显示)
A1:用记事本打开该文件,另存为 UTF-8 编码格式,再用 Notepad++(其他编辑器亦可)打开即可
Q2 :
在Linux系统上运行jar时出现找不到主类
A2 :使用IDEA时pom.xml加入以下依赖并在 <mainClass></mainClass>部分写入你类全路径
<build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.4.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                            <transformers>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass></mainClass>
                                </transformer>
                            </transformers>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
Linux效果图: