从百度空间到CSDN——博客搬家源码-阿里云开发者社区

注意：下面的方法在csdn博客改版以后无法使用，因为现在csdn博客不支持MetaWeblog API，不知道什么时候可以支持。

1.原文链接

http://hi.baidu.com/cnjsp/blog/item/e175cf1b27bc6af6ae513335.html

2.心得

本方法我测试过，是可以用的。一则感觉思路挺新颖，程序员自己写代码解决自己的事情；二则可以通过这个实例学习一下java，所以我贴出我修改后的java代码。

具体思路可以参见原文。

3.代码

CSDNPost.java

 
   [java] 
   view plain
   copy
   print
   ? 
  
package cn.mingyuan.baidu2csdn.core;  
  
import java.io.FileOutputStream;  
import java.io.IOException;  
import java.net.MalformedURLException;  
import java.net.URL;  
import java.util.Date;  
import java.util.HashMap;  
import java.util.Map;  
import org.apache.xmlrpc.XmlRpcException;  
import org.apache.xmlrpc.client.XmlRpcClient;  
import org.apache.xmlrpc.client.XmlRpcClientConfigImpl;  
  
/** 
 * csdn博文 
 *  
 * @author mingyuanonline@gmail.com 
 *  
 */  
public class CSDNPost {  
    /** 
     * 博文创建日期 
     */  
    private Date dateCreated;  
    /** 
     * 博文内容 
     */  
    private String description;  
    /** 
     * 标题 
     */  
    private String title;  
    /** 
     * 博文分类 
     */  
    private String[] categories;  
  
    public CSDNPost() {  
  
    }  
  
    public CSDNPost(String title, String description, String[] categories,  
            Date dateCreated) {  
        this.dateCreated = dateCreated;  
        this.description = description;  
        this.title = title;  
        this.categories = categories;  
    }  
  
    public Date getDateCreated() {  
        return dateCreated;  
    }  
  
    public void setDateCreated(Date dateCreated) {  
        this.dateCreated = dateCreated;  
    }  
  
    public String getDescription() {  
        return description;  
    }  
  
    public void setDescription(String description) {  
        this.description = description;  
    }  
  
    public String getTitle() {  
        return title;  
    }  
  
    public void setTitle(String title) {  
        this.title = title;  
    }  
  
    public String[] getCategories() {  
        return categories;  
    }  
  
    public void setCategories(String[] categories) {  
        this.categories = categories;  
    }  
  
    /** 
     * xml-rpc配置 
     */  
    private static XmlRpcClientConfigImpl config;  
    /** 
     * xml-rpcClient 
     */  
    private static XmlRpcClient client;  
  
    static {  
        config = new XmlRpcClientConfigImpl();  
        try {  
            // 此处请将telnetor替换为您的用户名  
            config.setServerURL(new URL(  
                    "http://blog.csdn.net/xw13106209/services/metablogapi.aspx"));  
        } catch (MalformedURLException e) {  
            System.out.println("请检查url");  
        }  
        client = new XmlRpcClient();  
        client.setConfig(config);  
    }  
  
    /** 
     * 日志记录 
     *  
     * @param log 
     *            log 
     */  
    private void writelog(String log) {  
        FileOutputStream fos = null;  
        try {  
            fos = new FileOutputStream("post.log", true);  
            fos.write((log + "\r\n").getBytes());  
            fos.flush();  
            fos.close();  
        } catch (IOException e) {  
            System.out.println("写入日志错误：" + log);  
        }  
    }  
  
    /** 
     * 发布 
     */  
    public void publish() {  
        Map<String, Object> struct = new HashMap<String, Object>();  
        struct.put("dateCreated", dateCreated);  
        struct.put("description", description);  
        struct.put("title", title);  
        struct.put("categories", categories);  
//      Object[] params = new Object[] { "your usrname",  
//              "replace it with your username",  
//              "replace it with your password", struct, true };  
          
        Object[] params = new Object[] { "xw13106209",  
        "xw13106209",  
        "password", struct, true };  
          
        String blogid = null;  
        try {  
            blogid = (String) client.execute("metaWeblog.newPost", params);  
        } catch (XmlRpcException e) {  
            writelog("导入出现错误：title=" + title);  
            System.out.println("导入出现错误：title=" + title);  
        }  
        writelog(title + ">> 导入完毕,生成博文id为>>" + blogid);  
        System.out.println(title + ">> 导入完毕,生成博文id为>>" + blogid);  
        struct.clear();  
    }  
  
    public static void main(String[] args) {  
        CSDNPost post = new CSDNPost();  
        post.publish();  
    }  
}  

BaiduHi

 
   [java] 
   view plain
   copy
   print
   ? 
  
package cn.mingyuan.baidu2csdn.core;  
  
import java.util.Date;  
  
/**
 * Value holder for a single post read from a Baidu Space blog.
 *
 * @author mingyuanonline@gmail.com
 */
public class BaiduHi {
    /** Post title. */
    private String title;
    /** Post body. */
    private String description;
    /** Category of the post. */
    private String categories;
    /** Date the post was published. */
    private Date dateCreated;

    /** Creates a post with every field unset ({@code null}). */
    public BaiduHi() {
    }

    /**
     * Creates a fully populated post.
     *
     * @param title
     *            post title
     * @param description
     *            post body
     * @param categories
     *            category of the post
     * @param dateCreated
     *            publication date
     */
    public BaiduHi(String title, String description, String categories,
            Date dateCreated) {
        this.dateCreated = dateCreated;
        this.categories = categories;
        this.description = description;
        this.title = title;
    }

    /** @return the post title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the post title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the post body */
    public String getDescription() {
        return this.description;
    }

    /** @param description the post body */
    public void setDescription(String description) {
        this.description = description;
    }

    /** @return the category of the post */
    public String getCategories() {
        return this.categories;
    }

    /** @param categories the category of the post */
    public void setCategories(String categories) {
        this.categories = categories;
    }

    /** @return the publication date */
    public Date getDateCreated() {
        return this.dateCreated;
    }

    /** @param dateCreated the publication date */
    public void setDateCreated(Date dateCreated) {
        this.dateCreated = dateCreated;
    }

    /**
     * Entry point kept from the generated stub; it does nothing.
     *
     * @param args
     *            ignored
     */
    public static void main(String[] args) {
    }
}

BaiduHiFetcher

 
   [java] 
   view plain
   copy
   print
   ? 
  
package cn.mingyuan.baidu2csdn.core;    
  
import java.io.BufferedReader;  
import java.io.IOException;  
import java.io.InputStream;  
import java.io.InputStreamReader;  
import java.net.MalformedURLException;  
import java.net.URL;  
import java.net.URLConnection;  
import java.util.ArrayList;  
import java.util.Date;  
import java.util.List;  
import java.util.Stack;  
import java.util.regex.Matcher;  
import java.util.regex.Pattern;  
  
/** 
 * 百度博客数据抓取及解析 
 *  
 * @author mingyuanonline@gmail.com 
 *  
 */  
public class BaiduHiFetcher {  
    /** 
     * 下载页面 
     *  
     * @param url 
     *            url 
     * @return 网页源码 
     */  
    private String downloadPage(String url) {  
        URLConnection conn;  
        InputStream in;  
        BufferedReader reader = null;  
        StringBuilder sb = new StringBuilder();  
        String line = null;  
        try {  
            conn = new URL(url).openConnection();  
            in = conn.getInputStream();  
            reader = new BufferedReader(new InputStreamReader(in, "gb2312"));  
            while ((line = reader.readLine()) != null) {  
                sb.append(line);  
            }  
            in.close();  
            reader.close();  
        } catch (MalformedURLException e) {  
            System.out.println("请检查url是否规范");  
        } catch (IOException e) {  
            System.out.println("读取源码错误:" + url);  
        }  
        return sb.toString();  
    }  
  
    /** 
     * 获取页面博文链接 
     *  
     * @param html 
     *            网页源码 
     * @return 页面中的博文链接 
     */  
    private List<String> getPostLinks(String html) {  
        // 分析页面内容，取得页面中的文章链接  
        String titleDivRegex = "<div[\\s]class=\"tit\"><a[\\s]href=[^<>]+?target=\"_blank\">.+?</div>";  
        Pattern titleDivPattern = Pattern.compile(titleDivRegex);  
        Matcher titleDivMatcher = titleDivPattern.matcher(html);  
        List<String> posts = new ArrayList<String>();  
        while (titleDivMatcher.find()) {  
            String div = titleDivMatcher.group();  
            String titleUrl = div.substring(div.indexOf("/"), div  
                    .indexOf("\" target"));  
            posts.add("http://hi.baidu.com" + titleUrl);  
        }  
        return posts;  
    }  
  
    /** 
     * <p> 
     * 获取博客总页数 <br> 
     * 我的博客内容有16页，有上一页，下一页，尾页等这样的标志，如果博文少的话可能这些标志不会出现，请修改此方法 
     *  
     * @param html 
     *            源码（最好是第一页） 
     * @return 博客总页数 
     */  
    private int getTotalPages(String html) {  
        // 页码  
        // <a href="/cnjsp/blog/index/16"  
        // mce_href="cnjsp/blog/index/16">[尾页]</a>  
        String pageRegex = "<a[\\s]href=\"/cnjsp/blog/index/[\\d][\\d]\">\\[尾页\\]</a>";  
        Pattern pagePattern = Pattern.compile(pageRegex);  
        Matcher pageMatcher = pagePattern.matcher(html);  
        String totalPagesStr = null;  
        int pages = 0;  
        if (pageMatcher.find()) {  
            String pagelink = pageMatcher.group();  
            totalPagesStr = pagelink.replaceAll(  
                    "<a[\\s]href=\"/cnjsp/blog/index/", "").replaceAll(  
                    "\">\\[尾页\\]</a>", "");  
            pages = Integer.parseInt(totalPagesStr);  
        }  
        return pages;  
    }  
  
    /** 
     * <p> 
     * 获取博客的所有博文的地址 <br> 
     * 没有对url进行编码处理，如果博客地址含中文，请对url进行处理 
     *  
     * @param blogUrl 
     *            博客地址 
     * @return 所有博文地址，存放于栈中，使用的时候请使用pop方法取出元素，这样可以保证按照最先发表的博文最先处理 
     */  
    public Stack<String> getAllPostLink(String blogUrl) {  
        Stack<String> posts = new Stack<String>();  
        // 1.下载第一页  
        String firstPageHtml = downloadPage(blogUrl + "/blog/index/0");  
        // 2.获取博文总页数  
//      int totalPages = getTotalPages(firstPageHtml);  
        int totalPages = 2;  
        // 3.下载各摘要页  
        posts.addAll(getPostLinks(firstPageHtml));  
        if (totalPages < 1) {  
            return posts;  
        }  
        for (int i = 1; i <= totalPages; i++) {  
            String page = downloadPage(blogUrl + "/blog/index/" + i);  
            posts.addAll(getPostLinks(page));  
        }  
        return posts;  
    }  
  
    /** 
     * 解析博文，获取标题，发布时间，内容，分类等信息 
     *  
     * @param postUrl 
     *            博文地址 
     * @return 封装了博文信息的BaiduHi 
     */  
    public BaiduHi getBaiduHi(String postUrl) {  
        String html = downloadPage(postUrl);  
        // /<div class="tit">  
        String titleDivRegex = "<div[\\s]id=\"m_blog\"[\\s]class=\"modbox\"[\\s]style=\"overflow-x:hidden;\"><div[\\s]class=\"tit\">.+?</div><div[\\s]class=\"date\">";  
        Pattern titleDivPattern = Pattern.compile(titleDivRegex);  
        Matcher titleDivMatcher = titleDivPattern.matcher(html);  
        String title = null;  
        if (titleDivMatcher.find()) {  
            title = titleDivMatcher  
                    .group()  
                    .replaceAll(  
                            "<div[\\s]id=\"m_blog\"[\\s]class=\"modbox\"[\\s]style=\"overflow-x:hidden;\"><div[\\s]class=\"tit\">",  
                            "")  
                    .replaceAll("</div><div[\\s]class=\"date\">", "").trim();  
        }  
        String dateDivRegex = "<div[\\s]class=\"date\">.+?</div>";  
        Pattern dateDivPattern = Pattern.compile(dateDivRegex);  
        Matcher dateMatcher = dateDivPattern.matcher(html);  
        String dateStr = null;  
        Date postDate = null;  
        if (dateMatcher.find()) {  
            dateStr = dateMatcher.group().replaceAll(  
                    "<div[\\s]class=\"date\">", "").replaceAll("</div>", "")  
                    .trim();  
            postDate = getDate(dateStr);  
        }  
        String textDivRegex = "<div[\\s]id=\"blog_text\"[\\s]class=\"cnt\"[\\s]+>.+?</div>";  
        Pattern textDivPattern = Pattern.compile(textDivRegex);  
        Matcher textMatcher = textDivPattern.matcher(html);  
        String text = null;  
        if (textMatcher.find()) {  
            text = textMatcher.group().replaceAll(  
                    "<div[\\s]id=\"blog_text\"[\\s]class=\"cnt\"[\\s]+>", "")  
                    .replaceAll("</div>", "").trim();  
        }  
        String categoriesRegex = "title=\"查看该分类中所有文章\">类别：.+?</a>";  
        Pattern categoriesDivPattern = Pattern.compile(categoriesRegex);  
        Matcher categoriesMatcher = categoriesDivPattern.matcher(html);  
        String categories = null;  
        if (categoriesMatcher.find()) {  
            categories = categoriesMatcher.group().replaceAll(  
                    "title=\"查看该分类中所有文章\">类别：", "").replaceAll("</a>", "")  
                    .trim();  
        }  
        BaiduHi hi = new BaiduHi();  
        hi.setTitle(title);  
        hi.setDescription(text);  
        hi.setCategories(categories);  
        hi.setDateCreated(postDate);  
        return hi;  
    }  
  
    /** 
     * 解析博文中的日期格式返回Date类型 
     * 日期格式为：2011年07月01日 星期五 下午 01:05 
     * @param str 
     *            博文中的日期 
     * @return Date类型日期 
     */  
    @SuppressWarnings("deprecation")  
    private Date getDate(String str) {  
        String yearStr = str.substring(0, str.indexOf("年")).trim();  
        String monthStr = str.substring(str.indexOf("年"), str.indexOf("月"))  
                .replace("年", "").trim();  
        String dayStr = str.substring(str.indexOf("月"), str.indexOf("日"))  
                .replace("月", "").trim();  
        String timeStr = str.substring(str.indexOf("午")).replace("午", "")  
                .trim();  
        String hourStr = timeStr.split(":")[0];  
        String minutesStr = timeStr.split(":")[1];  
        Date date = new Date();  
        date.setYear(Integer.parseInt(yearStr) - 1900);  
        date.setMonth(Integer.parseInt(monthStr) - 1);  
        date.setDate(Integer.parseInt(dayStr));  
        if (str.contains("下午")) {  
            date.setHours(Integer.parseInt(hourStr) + 12);  
        } else {  
            date.setHours(Integer.parseInt(hourStr));  
        }  
        date.setMinutes(Integer.parseInt(minutesStr));  
        return date;  
    }  
}  

Transfer

 
   [java] 
   view plain
   copy
   print
   ? 
  
package cn.mingyuan.baidu2csdn.core;  
  
import java.util.Stack;  
  
/** 
 * 搬家 
 *  
 * @author mingyuanonline@gmail.com 
 *  
 */  
public class Transfer {  
    /** 
     * @param args 
     */  
    public static void main(String[] args) {  
        // TODO Auto-generated method stub  
        //String postUrl = "http://hi.baidu.com/cnjsp";  
        String postUrl = "http://hi.baidu.com/xwdreamer";  
        BaiduHiFetcher fetcher = new BaiduHiFetcher();  
        Stack<String> urls = null;  
        urls = fetcher.getAllPostLink(postUrl);  
        while (!urls.isEmpty()) {  
            String url = urls.pop();  
            BaiduHi hi = null;  
            hi = fetcher.getBaiduHi(url);  
            CSDNPost post = new CSDNPost();  
            post.setTitle(hi.getTitle());  
            post.setDescription(hi.getDescription());  
            post.setCategories(new String[] { hi.getCategories() });  
            post.setDateCreated(hi.getDateCreated());  
            post.publish();  
            try {  
                Thread.sleep(5 * 1000);  
            } catch (InterruptedException e) {  
                System.out.println("休眠出错");  
            }  
        }  
    }  
}  

DeletePostById

 
   [java] 
   view plain
   copy
   print
   ? 
  

package cn.mingyuan.baidu2csdn.core;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import org.apache.xmlrpc.XmlRpcException;
import org.apache.xmlrpc.client.XmlRpcClient;
import org.apache.xmlrpc.client.XmlRpcClientConfigImpl;
/**
 * Deletes CSDN posts whose ids are listed in a log file, using the
 * {@code blogger.deletePost} XML-RPC method.
 */
public class DeletePostById {
    /**
     * Texts that precede the post id in a log line. The first is what
     * {@code CSDNPost.publish()} writes; the second is the full-width-colon
     * variant this class originally looked for.
     */
    private static final String[] ID_MARKERS = { "生成博文id为>>", "生成博文id为：" };
    /** Name of the file, relative to the working directory, that lists the posts to delete. */
    private static final String INPUT_FILE = "content";

    private static XmlRpcClientConfigImpl config;
    private static XmlRpcClient client;

    static {
        config = new XmlRpcClientConfigImpl();
        try {
            config.setServerURL(new URL(
                    "http://blog.csdn.net/telnetor/services/metablogapi.aspx"));
        } catch (MalformedURLException e) {
            System.out.println("请检查url");
        }
        client = new XmlRpcClient();
        client.setConfig(config);
    }

    /**
     * Deletes one post.
     *
     * @param appkey
     *            application key; per the original author the value is
     *            ignored. {@code null} is replaced by a placeholder
     * @param postid
     *            id of the post to delete
     * @param username
     *            account name
     * @param password
     *            account password
     * @param publish
     *            whether the blog is republished after the post is deleted
     */
    public static void delete(String appkey, String postid, String username,
            String password, boolean publish) {
        Object[] params = new Object[] {
                appkey == null ? "ignored value" : appkey, postid, username,
                password, publish };
        try {
            client.execute("blogger.deletePost", params);
        } catch (XmlRpcException e) {
            System.out.println("删除出错，postid=" + postid);
            return; // do not report success for a failed deletion
        }
        System.out.println(postid + "删除完毕");
    }

    /**
     * Extracts the post id from one log line.
     *
     * @param line
     *            log line
     * @return the text after the last id marker, trimmed; {@code null} if the
     *         line has no marker or the id is empty or the literal
     *         {@code "null"}
     */
    private static String extractPostId(String line) {
        for (String marker : ID_MARKERS) {
            int at = line.lastIndexOf(marker);
            if (at < 0) {
                continue;
            }
            String id = line.substring(at + marker.length()).trim();
            if (id.length() == 0 || "null".equals(id)) {
                return null;
            }
            return id;
        }
        return null;
    }

    /**
     * Reads the file {@code content} line by line and deletes every post whose
     * id can be extracted, pausing ten seconds after each deletion. Lines
     * without a usable id are skipped.
     *
     * @param args
     *            ignored
     * @throws InterruptedException
     *             if the thread is interrupted during a pause
     */
    public static void main(String[] args) throws InterruptedException {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(
                    new FileInputStream(INPUT_FILE)));
            String line;
            while ((line = reader.readLine()) != null) {
                String postid = extractPostId(line);
                if (postid == null) {
                    continue;
                }
                delete("ignored", postid, "your username", "your password",
                        true);
                Thread.sleep(1000 * 10);
            }
        } catch (FileNotFoundException e1) {
            System.out.println("文件没找到");
        } catch (IOException e) {
            System.out.println("读取文件失败");
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // Read-only stream; a failed close loses nothing.
                }
            }
        }
    }
}

本文转自xwdreamer博客园博客，原文链接：http://www.cnblogs.com/xwdreamer/archive/2011/07/19/2296977.html，如需转载请自行联系原作者

从百度空间到CSDN——博客搬家源码

1.原文连接

2.心得

3.代码

CSDNPost.java

BaiduHi

BaiduHiFetcher

Transfer

DeletePostById

热门文章

最新文章

相关课程

相关电子书

相关实验场景

热门

活动广场

任务中心

开发者评测

高校计划

乘风者计划

训练营

阿里云MVP

话题

直播

下载

镜像站

技术资料

插件

从百度空间到CSDN——博客搬家源码

1.原文连接

2.心得

3.代码

CSDNPost.java

BaiduHi

BaiduHiFetcher

Transfer

DeletePostById

热门文章

最新文章

相关课程

相关电子书

相关实验场景