package jklz; import org.apache.commons.io.FileUtils; import java.io.File; import java.io.IOException; import java.util.*; /** * @Author: JavaPub * @License: https://github.com/Rodert/ https://gitee.com/rodert/ * @Contact: https://javapub.blog.csdn.net/ * @Date: 2022/5/25 16:41 * @Version: 1.0 * @Description: */ public class Q { public static void main(String[] args) throws IOException { Set<String> objects = new HashSet<>(); long count = 0; List<String> urlList = FileUtils.readLines(new File("C:\\Users\\wangshiyu\\Desktop\\url.txt"), "utf8"); for (String url : urlList) { try { int strStartIndex = url.indexOf("http"); int strEndIndex = url.indexOf("/", strStartIndex + 9); String substring = url.substring(strStartIndex, strEndIndex).substring("http".length()); // System.out.println("http" + substring); count++; objects.add("http" + substring); } catch (Exception e) { // System.out.println(url); } } System.out.println(count); System.out.println(objects.size()); FileUtils.writeLines(new File("C:\\Users\\wangshiyu\\Desktop\\list.txt"), objects); } }
package jklz; import org.apache.commons.io.FileUtils; import java.io.File; import java.io.IOException; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * @Author: JavaPub * @License: https://github.com/Rodert/ https://gitee.com/rodert/ * @Contact: https://javapub.blog.csdn.net/ * @Date: 2022/5/25 15:51 * @Version: 1.0 * @Description: */ public class DisHttpUrl { private static final String url = "http://europa.eu.int/eur-lex/lex/LexUriServ/site/en/oj/2005/l_069/l_06920050316en00590063.pdf"; private static final String RE_TOP = "(\\w*\\.?){1}\\.(com.cn|net.cn|gov.cn|org\\.nz|org.cn|com|net|org|gov|cc|biz|info|cn|co)$"; private static final String RE_TOP2 = "(?<=http://|\\.)[^.]*?\\.(ps|ba|gi|qa|sk|ar|is|rs|am|sy|ve|energy|pa|hu|vg|ky|gg|do|gl|in|ee|pl|gr|ie|no|de|uy|kz|pt|bg|zm|md|ro|vn|ly|cu|th|fi|dk|lv|by|at|edu|ae|nl|sd|fi|ua|se|mt|ch|lu|id|kr|it|es|mx|fr|mc|be|si|us|hk|ir|io|or.kr|gob.cu|ru|jp|eu|uk|ca|int|iq|eu.int|com.cn|net.cn|gov.cn|org\\.nz|org.cn|com|net|org|gov|cc|biz|info|cn|co)"; private static final String RE_TOP2_https = "(?<=https://|\\.)[^.]*?\\.(ee|pl|gr|ie|no|de|uy|kz|pt|bg|zm|md|ro|vn|ly|cu|th|fi|dk|lv|by|at|edu|ae|nl|sd|fi|ua|se|mt|ch|lu|id|kr|it|es|mx|fr|mc|be|si|us|hk|ir|io|or.kr|gob.cu|ru|jp|eu|uk|ca|int|iq|eu.int|com.cn|net.cn|gov.cn|org\\.nz|org.cn|com|net|org|gov|cc|biz|info|cn|co)"; public static void main(String[] args) throws IOException { Set<String> objects = new HashSet<>(); Pattern p = Pattern.compile(RE_TOP2, Pattern.CASE_INSENSITIVE); Pattern p2 = Pattern.compile(RE_TOP2_https, Pattern.CASE_INSENSITIVE); List<String> urlList = FileUtils.readLines(new File("C:\\Users\\wangshiyu\\Desktop\\url.txt"), "utf8"); long count = 0; for (String s : urlList) { if (s.contains("http")) { Matcher matcher = p.matcher(s); if (matcher.find()) { String group = matcher.group(); objects.add("http://" + group); // System.out.println(group); // System.out.println("第:" + count++); } else { Matcher matcher2 = p2.matcher(s); if (matcher2.find()) { String group = matcher2.group(); objects.add("https://" + group); // System.out.println("https://" + group); // System.out.println("第:" + count++); } else { System.out.println("##### " + s); } } } } } }