# 主要技术就是 bs4(BeautifulSoup 4)
import json
import os
import time

import requests
from bs4 import BeautifulSoup

from access.sprider.SpriderAccess import SpriderAccess
from base.BaseConfig import BaseConfig
from base.BaseFrame import BaseFrame
from object.entity.SpriderEntity import SpriderEntity
from plugin.Tools import Tools


class QianQian:
    """Crawl the chart pages of music.taihe.com and download the listed songs.

    Every song found on a chart page is recorded through ``SpriderAccess`` and
    then downloaded as ``<save_path>/<title>.mp3``.
    """

    base_url = "http://music.taihe.com/"  # root URL of the site being crawled
    # Chart page names; each one is appended to "<base_url>top/".
    sprider_url = ["dayhot", "new", "netsong", "oldsong"]
    # Directory the MP3 files are written to.
    save_path = BaseConfig().CORPUS_ROOT + os.sep + "BaiduMusic"
    # Seconds passed as ``timeout`` to every HTTP request, so that a stalled
    # connection cannot block the crawl forever.
    request_timeout = 30
    # Path separators and characters that are reserved in file names on common
    # file systems; each is mapped to "_" before a title is used in a path.
    _unsafe_filename_chars = str.maketrans(
        {char: "_" for char in '\\/:*?"<>|\0'}
    )

    def __init__(self):
        pass

    @staticmethod
    def _safe_filename(title):
        """Return *title* with path separators and reserved characters replaced by "_"."""
        return str(title).translate(QianQian._unsafe_filename_chars)

    @staticmethod
    def _parse_jsonp(body, callback):
        """Decode the JSON payload of a ``callback(<json>)`` JSONP response.

        :param body: raw response text
        :param callback: callback name that was sent with the request
        :return: the decoded JSON payload
        :raises ValueError: if no parenthesised payload is found or it is not
            valid JSON
        """
        # Start looking for "(" after the callback name when it is present,
        # so a prefix in front of the call does not confuse the search.
        name_at = body.find(callback)
        search_from = name_at + len(callback) if name_at >= 0 else 0
        start = body.index("(", search_from)
        # Use the last ")" instead of a fixed-length suffix, so trailing
        # ";", whitespace or newlines do not corrupt the payload.
        end = body.rindex(")")
        return json.loads(body[start + 1:end])

    def sprider_top(self, rege=True, xinge=True, wangluo=True, laoge=True):
        """Crawl the chart pages and download every song not recorded yet.

        Only an explicit ``False`` disables a chart (identity check).

        :param rege: crawl the "dayhot" chart
        :param xinge: crawl the "new" chart
        :param wangluo: crawl the "netsong" chart
        :param laoge: crawl the "oldsong" chart
        :return: None
        """
        # Kept in a local variable rather than written back to self.base_url,
        # so calling this method twice on one instance does not yield
        # ".../top/top/".
        top_url = self.base_url + "top" + "/"
        BaseFrame.__log__("开始采集热榜音乐" + top_url)

        chart_flags = {
            "dayhot": rege,
            "new": xinge,
            "netsong": wangluo,
            "oldsong": laoge,
        }
        for chart in self.sprider_url:
            if chart_flags.get(chart) is False:
                continue

            url = top_url + chart
            response = requests.get(url, timeout=self.request_timeout)
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, "html5lib")

            for a in soup.find_all('a', attrs={"data-film": 'null'}):
                href = a.get("href")
                if href is None:
                    # Without a link there is no song id to query or download.
                    continue
                songid = str(href).replace("/song/", "")
                # Fall back to the id so a missing title attribute cannot
                # break the string concatenations below.
                title = a.get("title") or songid

                mmEntity = SpriderEntity()
                mmEntity.sprider_base_url = top_url
                mmEntity.create_datetime = Tools.get_current_datetime()
                mmEntity.sprider_url = url
                mmEntity.sprider_pic_title = title
                mmEntity.sprider_pic_index = songid

                # Songs that already have a record are not downloaded again.
                # NOTE(review): the record is saved before the download runs,
                # so a failed download is not retried on the next crawl -
                # confirm this is intended.
                if SpriderAccess().query_sprider_entity_by_urlandindex(url, songid) is None:
                    SpriderAccess().save_sprider(mmEntity)
                    self.get_mp3_address_and_download(songid, title)
                else:
                    BaseFrame.__log__(title + ".mp3 数据采集过因此跳过")

        BaseFrame.__log__("热榜音乐采集完毕!!!")

    def get_mp3_address_and_download(self, songid, title):
        """Resolve the MP3 address of a song and save it below ``save_path``.

        Best effort: any failure is logged and the method returns normally.

        :param songid: id of the song, sent to the play API
        :param title: song title, used for logging and as the file name
        :return: None
        """
        title = title or str(songid)
        BaseFrame.__log__("正在下载" + title)
        try:
            apiurl = "http://musicapi.taihe.com/v1/restserver/ting"
            callback = "jQuery17200943498528136486_" + str(round(time.time() * 1000))
            hua = str(round(time.time() * 1000))
            params = {
                "method": "baidu.ting.song.playAAC",
                "format": "jsonp",
                "songid": songid,
                "from": "web",
                "callback": callback,
                "_": hua,
            }
            api_response = requests.get(apiurl, params=params, timeout=self.request_timeout)
            api_response.raise_for_status()
            payload = self._parse_jsonp(api_response.text, callback)
            song_address = payload["bitrate"]["file_link"]

            # Fetch the audio before touching the disk, so a failed request
            # does not leave an empty .mp3 file behind.
            audio = requests.get(song_address, timeout=self.request_timeout)
            audio.raise_for_status()

            # Presumably creates the directory when it is missing - see Tools.
            Tools.judge_diskpath_exits_create(self.save_path)
            save_path = self.save_path + os.sep + self._safe_filename(title) + ".mp3"
            with open(save_path, 'wb') as mp3w:
                mp3w.write(audio.content)
        except Exception as e:
            # Deliberately broad: one broken song must not abort the crawl.
            BaseFrame.__log__("下载音乐过程出现错误" + str(e))
        return

    def save_mp3_record(self, ):
        """Placeholder; currently does nothing."""
        pass


if __name__ == '__main__':
    QianQian().sprider_top()