注意:下面的方法在csdn博客改版以后无法使用,因为现在csdn博客不再支持MetaWeblog API,不知道什么时候可以恢复支持。
1.原文链接
http://hi.baidu.com/cnjsp/blog/item/e175cf1b27bc6af6ae513335.html
2.心得
本方法我测试过,是可以用的。一则感觉思路挺新颖的,程序员自己写代码解决自己的事情;二则可以通过这个实例学习一下java,所以我贴出我修改后的java代码。
具体思路可以参见原文。
3.代码
CSDNPost.java
- package cn.mingyuan.baidu2csdn.core;
- import java.io.FileOutputStream;
- import java.io.IOException;
- import java.net.MalformedURLException;
- import java.net.URL;
- import java.util.Date;
- import java.util.HashMap;
- import java.util.Map;
- import org.apache.xmlrpc.XmlRpcException;
- import org.apache.xmlrpc.client.XmlRpcClient;
- import org.apache.xmlrpc.client.XmlRpcClientConfigImpl;
- /**
- * csdn博文
- *
- * @author mingyuanonline@gmail.com
- *
- */
- public class CSDNPost {
- /**
- * 博文创建日期
- */
- private Date dateCreated;
- /**
- * 博文内容
- */
- private String description;
- /**
- * 标题
- */
- private String title;
- /**
- * 博文分类
- */
- private String[] categories;
- public CSDNPost() {
- }
- public CSDNPost(String title, String description, String[] categories,
- Date dateCreated) {
- this.dateCreated = dateCreated;
- this.description = description;
- this.title = title;
- this.categories = categories;
- }
- public Date getDateCreated() {
- return dateCreated;
- }
- public void setDateCreated(Date dateCreated) {
- this.dateCreated = dateCreated;
- }
- public String getDescription() {
- return description;
- }
- public void setDescription(String description) {
- this.description = description;
- }
- public String getTitle() {
- return title;
- }
- public void setTitle(String title) {
- this.title = title;
- }
- public String[] getCategories() {
- return categories;
- }
- public void setCategories(String[] categories) {
- this.categories = categories;
- }
- /**
- * xml-rpc配置
- */
- private static XmlRpcClientConfigImpl config;
- /**
- * xml-rpcClient
- */
- private static XmlRpcClient client;
- static {
- config = new XmlRpcClientConfigImpl();
- try {
- // 此处请将telnetor替换为您的用户名
- config.setServerURL(new URL(
- "http://blog.csdn.net/xw13106209/services/metablogapi.aspx"));
- } catch (MalformedURLException e) {
- System.out.println("请检查url");
- }
- client = new XmlRpcClient();
- client.setConfig(config);
- }
- /**
- * 日志记录
- *
- * @param log
- * log
- */
- private void writelog(String log) {
- FileOutputStream fos = null;
- try {
- fos = new FileOutputStream("post.log", true);
- fos.write((log + "\r\n").getBytes());
- fos.flush();
- fos.close();
- } catch (IOException e) {
- System.out.println("写入日志错误:" + log);
- }
- }
- /**
- * 发布
- */
- public void publish() {
- Map<String, Object> struct = new HashMap<String, Object>();
- struct.put("dateCreated", dateCreated);
- struct.put("description", description);
- struct.put("title", title);
- struct.put("categories", categories);
- // Object[] params = new Object[] { "your usrname",
- // "replace it with your username",
- // "replace it with your password", struct, true };
- Object[] params = new Object[] { "xw13106209",
- "xw13106209",
- "password", struct, true };
- String blogid = null;
- try {
- blogid = (String) client.execute("metaWeblog.newPost", params);
- } catch (XmlRpcException e) {
- writelog("导入出现错误:title=" + title);
- System.out.println("导入出现错误:title=" + title);
- }
- writelog(title + ">> 导入完毕,生成博文id为>>" + blogid);
- System.out.println(title + ">> 导入完毕,生成博文id为>>" + blogid);
- struct.clear();
- }
- public static void main(String[] args) {
- CSDNPost post = new CSDNPost();
- post.publish();
- }
- }
BaiduHi.java
- package cn.mingyuan.baidu2csdn.core;
- import java.util.Date;
/**
 * A blog post scraped from a Baidu Hi blog: title, body, category and
 * publication date.
 *
 * @author mingyuanonline@gmail.com
 */
public class BaiduHi {
    /** Title of the post. */
    private String title;
    /** Body of the post. */
    private String description;
    /** Category of the post. */
    private String categories;
    /** Publication date of the post; may be {@code null}. */
    private Date dateCreated;

    /** Creates an empty post; populate it through the setters. */
    public BaiduHi() {
    }

    /**
     * Creates a fully populated post.
     *
     * @param title
     *            title of the post
     * @param description
     *            body of the post
     * @param categories
     *            category of the post
     * @param dateCreated
     *            publication date, may be {@code null}; it is copied
     */
    public BaiduHi(String title, String description, String categories,
            Date dateCreated) {
        this.title = title;
        this.description = description;
        this.categories = categories;
        this.dateCreated = copyOf(dateCreated);
    }

    /** @return title of the post */
    public String getTitle() {
        return title;
    }

    /** @return body of the post */
    public String getDescription() {
        return description;
    }

    /** @return category of the post */
    public String getCategories() {
        return categories;
    }

    /**
     * @return a copy of the publication date, or {@code null} if none was set;
     *         modifying the returned object does not affect this post
     */
    public Date getDateCreated() {
        return copyOf(dateCreated);
    }

    /**
     * @param title
     *            title of the post
     */
    public void setTitle(String title) {
        this.title = title;
    }

    /**
     * @param description
     *            body of the post
     */
    public void setDescription(String description) {
        this.description = description;
    }

    /**
     * @param categories
     *            category of the post
     */
    public void setCategories(String categories) {
        this.categories = categories;
    }

    /**
     * @param dateCreated
     *            publication date, may be {@code null}; it is copied, so later
     *            changes to the argument do not affect this post
     */
    public void setDateCreated(Date dateCreated) {
        this.dateCreated = copyOf(dateCreated);
    }

    /**
     * java.util.Date is mutable, so it is copied on the way in and out to keep
     * the state of this object under its own control.
     */
    private static Date copyOf(Date date) {
        return date == null ? null : new Date(date.getTime());
    }

    /**
     * Does nothing; kept only so the class remains launchable as before.
     *
     * @param args
     *            unused
     */
    public static void main(String[] args) {
    }
}
BaiduHiFetcher.java
- package cn.mingyuan.baidu2csdn.core;
- import java.io.BufferedReader;
- import java.io.IOException;
- import java.io.InputStream;
- import java.io.InputStreamReader;
- import java.net.MalformedURLException;
- import java.net.URL;
- import java.net.URLConnection;
- import java.util.ArrayList;
- import java.util.Date;
- import java.util.List;
- import java.util.Stack;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- /**
- * 百度博客数据抓取及解析
- *
- * @author mingyuanonline@gmail.com
- *
- */
- public class BaiduHiFetcher {
- /**
- * 下载页面
- *
- * @param url
- * url
- * @return 网页源码
- */
- private String downloadPage(String url) {
- URLConnection conn;
- InputStream in;
- BufferedReader reader = null;
- StringBuilder sb = new StringBuilder();
- String line = null;
- try {
- conn = new URL(url).openConnection();
- in = conn.getInputStream();
- reader = new BufferedReader(new InputStreamReader(in, "gb2312"));
- while ((line = reader.readLine()) != null) {
- sb.append(line);
- }
- in.close();
- reader.close();
- } catch (MalformedURLException e) {
- System.out.println("请检查url是否规范");
- } catch (IOException e) {
- System.out.println("读取源码错误:" + url);
- }
- return sb.toString();
- }
- /**
- * 获取页面博文链接
- *
- * @param html
- * 网页源码
- * @return 页面中的博文链接
- */
- private List<String> getPostLinks(String html) {
- // 分析页面内容,取得页面中的文章链接
- String titleDivRegex = "<div[\\s]class=\"tit\"><a[\\s]href=[^<>]+?target=\"_blank\">.+?</div>";
- Pattern titleDivPattern = Pattern.compile(titleDivRegex);
- Matcher titleDivMatcher = titleDivPattern.matcher(html);
- List<String> posts = new ArrayList<String>();
- while (titleDivMatcher.find()) {
- String div = titleDivMatcher.group();
- String titleUrl = div.substring(div.indexOf("/"), div
- .indexOf("\" target"));
- posts.add("http://hi.baidu.com" + titleUrl);
- }
- return posts;
- }
- /**
- * <p>
- * 获取博客总页数 <br>
- * 我的博客内容有16页,有上一页,下一页,尾页等这样的标志,如果博文少的话可能这些标志不会出现,请修改此方法
- *
- * @param html
- * 源码(最好是第一页)
- * @return 博客总页数
- */
- private int getTotalPages(String html) {
- // 页码
- // <a href="/cnjsp/blog/index/16"
- // mce_href="cnjsp/blog/index/16">[尾页]</a>
- String pageRegex = "<a[\\s]href=\"/cnjsp/blog/index/[\\d][\\d]\">\\[尾页\\]</a>";
- Pattern pagePattern = Pattern.compile(pageRegex);
- Matcher pageMatcher = pagePattern.matcher(html);
- String totalPagesStr = null;
- int pages = 0;
- if (pageMatcher.find()) {
- String pagelink = pageMatcher.group();
- totalPagesStr = pagelink.replaceAll(
- "<a[\\s]href=\"/cnjsp/blog/index/", "").replaceAll(
- "\">\\[尾页\\]</a>", "");
- pages = Integer.parseInt(totalPagesStr);
- }
- return pages;
- }
- /**
- * <p>
- * 获取博客的所有博文的地址 <br>
- * 没有对url进行编码处理,如果博客地址含中文,请对url进行处理
- *
- * @param blogUrl
- * 博客地址
- * @return 所有博文地址,存放于栈中,使用的时候请使用pop方法取出元素,这样可以保证按照最先发表的博文最先处理
- */
- public Stack<String> getAllPostLink(String blogUrl) {
- Stack<String> posts = new Stack<String>();
- // 1.下载第一页
- String firstPageHtml = downloadPage(blogUrl + "/blog/index/0");
- // 2.获取博文总页数
- // int totalPages = getTotalPages(firstPageHtml);
- int totalPages = 2;
- // 3.下载各摘要页
- posts.addAll(getPostLinks(firstPageHtml));
- if (totalPages < 1) {
- return posts;
- }
- for (int i = 1; i <= totalPages; i++) {
- String page = downloadPage(blogUrl + "/blog/index/" + i);
- posts.addAll(getPostLinks(page));
- }
- return posts;
- }
- /**
- * 解析博文,获取标题,发布时间,内容,分类等信息
- *
- * @param postUrl
- * 博文地址
- * @return 封装了博文信息的BaiduHi
- */
- public BaiduHi getBaiduHi(String postUrl) {
- String html = downloadPage(postUrl);
- // /<div class="tit">
- String titleDivRegex = "<div[\\s]id=\"m_blog\"[\\s]class=\"modbox\"[\\s]style=\"overflow-x:hidden;\"><div[\\s]class=\"tit\">.+?</div><div[\\s]class=\"date\">";
- Pattern titleDivPattern = Pattern.compile(titleDivRegex);
- Matcher titleDivMatcher = titleDivPattern.matcher(html);
- String title = null;
- if (titleDivMatcher.find()) {
- title = titleDivMatcher
- .group()
- .replaceAll(
- "<div[\\s]id=\"m_blog\"[\\s]class=\"modbox\"[\\s]style=\"overflow-x:hidden;\"><div[\\s]class=\"tit\">",
- "")
- .replaceAll("</div><div[\\s]class=\"date\">", "").trim();
- }
- String dateDivRegex = "<div[\\s]class=\"date\">.+?</div>";
- Pattern dateDivPattern = Pattern.compile(dateDivRegex);
- Matcher dateMatcher = dateDivPattern.matcher(html);
- String dateStr = null;
- Date postDate = null;
- if (dateMatcher.find()) {
- dateStr = dateMatcher.group().replaceAll(
- "<div[\\s]class=\"date\">", "").replaceAll("</div>", "")
- .trim();
- postDate = getDate(dateStr);
- }
- String textDivRegex = "<div[\\s]id=\"blog_text\"[\\s]class=\"cnt\"[\\s]+>.+?</div>";
- Pattern textDivPattern = Pattern.compile(textDivRegex);
- Matcher textMatcher = textDivPattern.matcher(html);
- String text = null;
- if (textMatcher.find()) {
- text = textMatcher.group().replaceAll(
- "<div[\\s]id=\"blog_text\"[\\s]class=\"cnt\"[\\s]+>", "")
- .replaceAll("</div>", "").trim();
- }
- String categoriesRegex = "title=\"查看该分类中所有文章\">类别:.+?</a>";
- Pattern categoriesDivPattern = Pattern.compile(categoriesRegex);
- Matcher categoriesMatcher = categoriesDivPattern.matcher(html);
- String categories = null;
- if (categoriesMatcher.find()) {
- categories = categoriesMatcher.group().replaceAll(
- "title=\"查看该分类中所有文章\">类别:", "").replaceAll("</a>", "")
- .trim();
- }
- BaiduHi hi = new BaiduHi();
- hi.setTitle(title);
- hi.setDescription(text);
- hi.setCategories(categories);
- hi.setDateCreated(postDate);
- return hi;
- }
- /**
- * 解析博文中的日期格式返回Date类型
- * 日期格式为:2011年07月01日 星期五 下午 01:05
- * @param str
- * 博文中的日期
- * @return Date类型日期
- */
- @SuppressWarnings("deprecation")
- private Date getDate(String str) {
- String yearStr = str.substring(0, str.indexOf("年")).trim();
- String monthStr = str.substring(str.indexOf("年"), str.indexOf("月"))
- .replace("年", "").trim();
- String dayStr = str.substring(str.indexOf("月"), str.indexOf("日"))
- .replace("月", "").trim();
- String timeStr = str.substring(str.indexOf("午")).replace("午", "")
- .trim();
- String hourStr = timeStr.split(":")[0];
- String minutesStr = timeStr.split(":")[1];
- Date date = new Date();
- date.setYear(Integer.parseInt(yearStr) - 1900);
- date.setMonth(Integer.parseInt(monthStr) - 1);
- date.setDate(Integer.parseInt(dayStr));
- if (str.contains("下午")) {
- date.setHours(Integer.parseInt(hourStr) + 12);
- } else {
- date.setHours(Integer.parseInt(hourStr));
- }
- date.setMinutes(Integer.parseInt(minutesStr));
- return date;
- }
- }
Transfer.java
- package cn.mingyuan.baidu2csdn.core;
- import java.util.Stack;
- /**
- * 搬家
- *
- * @author mingyuanonline@gmail.com
- *
- */
- public class Transfer {
- /**
- * @param args
- */
- public static void main(String[] args) {
- // TODO Auto-generated method stub
- //String postUrl = "http://hi.baidu.com/cnjsp";
- String postUrl = "http://hi.baidu.com/xwdreamer";
- BaiduHiFetcher fetcher = new BaiduHiFetcher();
- Stack<String> urls = null;
- urls = fetcher.getAllPostLink(postUrl);
- while (!urls.isEmpty()) {
- String url = urls.pop();
- BaiduHi hi = null;
- hi = fetcher.getBaiduHi(url);
- CSDNPost post = new CSDNPost();
- post.setTitle(hi.getTitle());
- post.setDescription(hi.getDescription());
- post.setCategories(new String[] { hi.getCategories() });
- post.setDateCreated(hi.getDateCreated());
- post.publish();
- try {
- Thread.sleep(5 * 1000);
- } catch (InterruptedException e) {
- System.out.println("休眠出错");
- }
- }
- }
- }
DeletePostById.java
- package cn.mingyuan.baidu2csdn.core;
- import java.io.BufferedReader;
- import java.io.FileInputStream;
- import java.io.FileNotFoundException;
- import java.io.IOException;
- import java.io.InputStreamReader;
- import java.net.MalformedURLException;
- import java.net.URL;
- import org.apache.xmlrpc.XmlRpcException;
- import org.apache.xmlrpc.client.XmlRpcClient;
- import org.apache.xmlrpc.client.XmlRpcClientConfigImpl;
- public class DeletePostById {
- private static XmlRpcClientConfigImpl config;
- private static XmlRpcClient client;
- static {
- config = new XmlRpcClientConfigImpl();
- try {
- config.setServerURL(new URL(
- "http://blog.csdn.net/telnetor/services/metablogapi.aspx"));
- } catch (MalformedURLException e) {
- System.out.println("请检查url");
- }
- client = new XmlRpcClient();
- client.setConfig(config);
- }
- /**
- * 删除帖子
- *
- * @param appkey
- * appkey,可以任意,这是一个忽略的值
- * @param postid
- * 帖子id
- * @param username
- * 用户名
- * @param password
- * 密码
- * @param publish
- * 博客在帖子被删除之后是否重新发布
- */
- public static void delete(String appkey, String postid, String username,
- String password, boolean publish) {
- Object[] params = new Object[] { "ignored value", postid, username,
- password, true };
- try {
- client.execute("blogger.deletePost", params);
- } catch (XmlRpcException e) {
- System.out.println("删除出错,postid=" + postid);
- }
- System.out.println(postid + "删除完毕");
- }
- /**
- * @param args
- * @throws InterruptedException
- */
- public static void main(String[] args) throws InterruptedException {
- BufferedReader reader = null;
- String line;
- try {
- reader = new BufferedReader(new InputStreamReader(
- new FileInputStream("content")));
- while ((line = reader.readLine()) != null) {
- line = line.split("生成博文id为:")[1];
- delete("ignored", line, "your username", "your password", true);
- Thread.sleep(1000 * 10);
- }
- } catch (FileNotFoundException e1) {
- System.out.println("文件没找到");
- } catch (IOException e) {
- System.out.println("读取文件失败");
- }
- }
- }
本文转自xwdreamer博客园博客,原文链接:http://www.cnblogs.com/xwdreamer/archive/2011/07/19/2296977.html,如需转载请自行联系原作者