思路讲解:
1.获取百家姓和男生名字以及女生名字的资源链接,资源在网上随便找一个都可以
String firstNameStr="https://hanyu.baidu.com/shici/detail?from=kg1&highlight=&pid=0b2f26d4c0ddb3ee693fdb1137ee1b0d&srcid=51369";
2.爬取网站的所有数据,以字符串储存数据
String boyName=webcrawling(boynameStr);
3.利用正则表达式获取数据中所需的中文姓氏和名字,根据网站汉字的格式编写正则
ArrayList<String> boyData = getData(boyName, "([\\u4e00-\\u9fa5]{2})[、]", 1);
4.用正则表达式获取到的姓氏是四个字一组,需要重新切割,得到每一个单独的姓氏
5.将姓氏和名字进行拼接,利用HashSet确保拼接出的姓名唯一
ArrayList<String> data=getName(firstData,boyData,girlData,10,10);
效果图:
全部代码:
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.ThreadLocalRandom;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Builds random Chinese full names from three web pages: one listing surnames, one listing
 * boys' given names and one listing girls' given names.
 *
 * <p>Each page is downloaded as text, the wanted characters are pulled out with a regular
 * expression, and surnames are then combined with given names into unique full names.
 */
public class text01 {

    /** Connect and read timeout applied to every download, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 10000;

    /** Random draws allowed per requested name before giving up on finding a new one. */
    private static final long ATTEMPTS_PER_NAME = 100L;

    /** Extra draws always allowed, so that small requests still get a fair number of tries. */
    private static final long BASE_ATTEMPTS = 1000L;

    /** Extracts the charset parameter from a Content-Type header value. */
    private static final Pattern CHARSET_PARAM =
            Pattern.compile("charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

    /**
     * Downloads the three source pages, extracts surnames and given names, and prints
     * a shuffled list of generated names (10 boy names and 10 girl names requested).
     *
     * @param args unused
     * @throws IOException if any of the pages cannot be downloaded
     */
    public static void main(String[] args) throws IOException {
        String firstNameStr = "https://hanyu.baidu.com/shici/detail?from=kg1&highlight=&pid=0b2f26d4c0ddb3ee693fdb1137ee1b0d&srcid=51369";
        String boynameStr = "https://m.pcbaby.com.cn/mip/baike/qzbd/180747.html";
        String girlnameStr = "https://zhuanlan.zhihu.com/p/522400912";

        // Surnames are captured as any four characters followed by a separator.
        // NOTE(review): the character class holds an ASCII comma; if the page separates groups
        // with a full-width comma only groups before the ideographic full stop will match.
        // Confirm against the page.
        String firstName = webcrawling(firstNameStr);
        ArrayList<String> firstTempData = getData(firstName, "(.{4})[,。]", 1);

        String boyName = webcrawling(boynameStr);
        ArrayList<String> boyData = getData(boyName, "([\\u4e00-\\u9fa5]{2})[、]", 1);

        String girlName = webcrawling(girlnameStr);
        ArrayList<String> girlData = getData(girlName, "([\\u4e00-\\u9fa5]{1,2})(、)", 1);

        // The surname matches are four characters each, so split them into single characters.
        ArrayList<String> firstData = splitIntoCharacters(firstTempData);

        ArrayList<String> data = getName(firstData, boyData, girlData, 10, 10);
        Collections.shuffle(data);
        System.out.println(data);
    }

    /**
     * Generates unique full names by pairing a random surname with a random given name.
     *
     * <p>Up to {@code boycount} names are built from {@code boyData}, then up to
     * {@code girlcount} further names from {@code girlData}. A draw that repeats an existing
     * name is retried, so the requested counts are met whenever enough distinct combinations
     * exist. If a list needed for a group is empty, or the distinct combinations run out
     * (detected by a bounded number of draws), fewer names are returned for that group
     * instead of failing or looping forever. The input lists are not modified.
     *
     * @param firstData surnames to pick from
     * @param boyData   boys' given names to pick from
     * @param girlData  girls' given names to pick from
     * @param boycount  number of boy names wanted; values of zero or less produce none
     * @param girlcount number of girl names wanted; values of zero or less produce none
     * @return the generated names, without duplicates, in no particular order
     */
    public static ArrayList<String> getName(ArrayList<String> firstData, ArrayList<String> boyData,
            ArrayList<String> girlData, int boycount, int girlcount) {
        Set<String> uniqueNames = new HashSet<>();
        Random random = ThreadLocalRandom.current();
        addRandomNames(uniqueNames, firstData, boyData, boycount, random);
        addRandomNames(uniqueNames, firstData, girlData, girlcount, random);
        return new ArrayList<>(uniqueNames);
    }

    /** Adds up to {@code count} new surname + given-name combinations to {@code names}. */
    private static void addRandomNames(Set<String> names, List<String> surnames,
            List<String> givenNames, int count, Random random) {
        // The count check comes first so that a group with nothing requested never touches its lists.
        if (count <= 0 || surnames.isEmpty() || givenNames.isEmpty()) {
            return;
        }
        long target = (long) names.size() + count;
        // Bound the number of draws: when fewer distinct combinations exist than were
        // requested, the loop must still terminate.
        long attemptsLeft = ATTEMPTS_PER_NAME * count + BASE_ATTEMPTS;
        while (names.size() < target && attemptsLeft-- > 0) {
            String surname = surnames.get(random.nextInt(surnames.size()));
            String givenName = givenNames.get(random.nextInt(givenNames.size()));
            names.add(surname + givenName);
        }
    }

    /** Splits every string in {@code groups} into one-character strings, by code point. */
    private static ArrayList<String> splitIntoCharacters(List<String> groups) {
        ArrayList<String> characters = new ArrayList<>();
        for (String group : groups) {
            int offset = 0;
            while (offset < group.length()) {
                int codePoint = group.codePointAt(offset);
                characters.add(new String(Character.toChars(codePoint)));
                offset += Character.charCount(codePoint);
            }
        }
        return characters;
    }

    /**
     * Downloads the resource at the given address and returns its content as text.
     *
     * <p>The text is decoded with the charset announced in the response's Content-Type
     * header, falling back to UTF-8 when none is announced or the announced one is not
     * supported, so the result does not depend on the platform default encoding.
     *
     * @param net address of the resource to download
     * @return the complete response body as a string
     * @throws IOException if the address is malformed, the connection fails, or it times out
     */
    public static String webcrawling(String net) throws IOException {
        URLConnection conn = new URL(net).openConnection();
        conn.setConnectTimeout(TIMEOUT_MILLIS);
        conn.setReadTimeout(TIMEOUT_MILLIS);

        StringBuilder sb = new StringBuilder();
        // The stream is opened first, so connection errors surface here and the response
        // headers are available when the charset is looked up.
        try (InputStream in = conn.getInputStream();
                Reader reader = new InputStreamReader(in, charsetOf(conn.getContentType()))) {
            char[] buffer = new char[8192];
            int read;
            while ((read = reader.read(buffer)) != -1) {
                sb.append(buffer, 0, read);
            }
        }
        return sb.toString();
    }

    /** Returns the charset named in a Content-Type value, or UTF-8 if absent or unusable. */
    private static Charset charsetOf(String contentType) {
        if (contentType != null) {
            Matcher matcher = CHARSET_PARAM.matcher(contentType);
            if (matcher.find()) {
                try {
                    return Charset.forName(matcher.group(1));
                } catch (IllegalArgumentException ignored) {
                    // Malformed or unsupported charset name: fall back to UTF-8 below.
                }
            }
        }
        return StandardCharsets.UTF_8;
    }

    /**
     * Collects one capture group from every match of a regular expression in a string.
     *
     * @param str   text to search
     * @param regex regular expression to apply
     * @param index capture group to collect from each match; 0 selects the whole match
     * @return the selected group of every match, in order of appearance
     */
    public static ArrayList<String> getData(String str, String regex, int index) {
        ArrayList<String> arr = new ArrayList<>();
        Matcher matcher = Pattern.compile(regex).matcher(str);
        while (matcher.find()) {
            arr.add(matcher.group(index));
        }
        return arr;
    }
}