对于图片等资源,如果已经采集过又重复采集一次,既浪费硬盘空间,也浪费采集时间。针对这个问题,我做了一个数据库设计,用来防止同样的数据被二次采集。
因为用的是 Python 语言,所以框架采用 SQLAlchemy。这玩意网上介绍有很多,洒家就不介绍了,不清楚的问问度娘。看代码吧。
from sqlalchemy import Column
from sqlalchemy.types import String
from main.base.BaseEntity import BaseEntity


class SpriderEntity(BaseEntity):
    """ORM mapping for the ``sprider_pic`` table.

    Every column is declared as ``String``; ``sprider_base_url`` is the
    only primary-key column.
    """

    __tablename__ = 'sprider_pic'

    # Sole primary key of the table.
    # NOTE(review): with this as the only key, at most one row can exist per
    # base URL -- confirm that is intended for a de-duplication table.
    sprider_base_url = Column(String, primary_key=True)
    # presumably the URL of the crawled page/resource -- TODO confirm
    sprider_url = Column(String)
    # presumably the position of the picture within its page, stored as text
    sprider_pic_index = Column(String)
    # presumably the title of the crawled picture/resource
    sprider_pic_title = Column(String)
    # presumably the listing page number, stored as text -- TODO confirm
    sprider_pager_index = Column(String)
    # presumably a category tag for the kind of resource crawled
    sprider_type = Column(String)
    # Creation timestamp kept as a string; the format is not defined here.
    create_datetime = Column(String)
当时目光短浅了,以为只会采集图片(pic),所以命名里都带上了 pic(例如表名 sprider_pic)。
另外一个核心就是数据的保存和查询,不废话直接代码:
from sqlalchemy import func, text
from main.base.BaseFrame import BaseFrame
from main.base.SqliteAccess import SqliteAccess
from main.base.MySqlAccess import MySqlAccess
import pymysql
from sprider.object.SpriderEntity import SpriderEntity
from main.plugin.log.Logger import Logger

pymysql.install_as_MySQLdb()


class SpriderAccess(SqliteAccess):
    """Data-access helpers for crawl records stored in ``sprider_pic``.

    Each method opens its own session through ``SqliteAccess.connection()``
    and always closes it in a ``finally`` clause.
    """

    # region save crawl record zhangyu-2019-7-18
    def save_sprider(self, model_entity):
        """Persist ``model_entity``.

        Returns:
            True when the commit succeeds; False when any exception is
            raised (the error is logged and the transaction rolled back).
        """
        session = SqliteAccess.connection()
        try:
            session.add(model_entity)
            session.commit()
            return True
        except Exception as e:
            Logger.error("save_sprider:保存数据出现 错误" + str(e))
            session.rollback()
            return False
        finally:
            session.close()
    # endregion

    def query_sprider_entity_by_urlandindex(self, sprider_url, pic_index):
        """Return the first record matching the URL and picture index.

        Both arguments are coerced with ``str()`` before comparison.
        Returns None when nothing matches or when the query fails.
        """
        session = SqliteAccess.connection()
        try:
            return session.query(SpriderEntity).filter_by(
                sprider_url=str(sprider_url),
                sprider_pic_index=str(pic_index),
            ).first()
        except Exception as e:
            BaseFrame.__err__("query_sprider" + str(e))
            return None
        finally:
            session.close()

    def query_sprider_entity_by_urlandtitle(self, sprider_url, sprider_title):
        """Return the first record matching the URL and picture title.

        Returns None when nothing matches or when the query fails.
        """
        session = SqliteAccess.connection()
        try:
            return session.query(SpriderEntity).filter_by(
                sprider_url=sprider_url,
                sprider_pic_title=sprider_title,
            ).first()
        except Exception as e:
            BaseFrame.__err__("query_sprider_entity_by_urlandtitle" + str(e))
            return None
        finally:
            session.close()

    def delete_sprider_info(self, sprider_url, sprider_title):
        """Delete the first record matching the URL and picture title.

        Returns:
            True when the record was deleted or did not exist; False when
            the delete failed (the error is logged and rolled back).
        """
        # Use the same SQLite connection as every other method of this class;
        # the previous MySqlAccess session pointed at a different database
        # than the one save_sprider writes to.
        session = SqliteAccess.connection()
        try:
            entity = session.query(SpriderEntity).filter_by(
                sprider_url=sprider_url,
                sprider_pic_title=sprider_title,
            ).first()
            if entity is None:
                return True
            session.delete(entity)
            session.commit()
            return True
        except Exception as e:
            Logger.error("delete_sprider_info" + str(e))
            session.rollback()
            return False
        finally:
            session.close()

    def find_pager_by_url_and_type(self, sprider_base_url, sprider_type):
        """Return the highest pager index recorded for a base URL and type.

        ``sprider_pager_index`` is a String column, so an SQL ``MAX`` would
        compare lexicographically ("9" > "10"). The distinct values are
        fetched and compared as integers in Python instead.

        Returns:
            The largest pager index as an int, or 0 when there is no
            matching row or when any error occurs (the error is logged).
        """
        session = SqliteAccess.connection()
        try:
            rows = (
                session.query(SpriderEntity.sprider_pager_index)
                .filter(
                    SpriderEntity.sprider_base_url == sprider_base_url,
                    SpriderEntity.sprider_type == sprider_type,
                )
                .distinct()
                .all()
            )
            # Skip NULL/empty values; a non-numeric value still raises and
            # falls through to the logged "return 0" path below.
            pages = [int(row[0]) for row in rows if row[0] not in (None, "")]
            return max(pages, default=0)
        except Exception as e:
            Logger.error(str(e))
            return 0
        finally:
            session.close()

    def query_sprider(self):
        """Return up to 10 raw rows from ``sprider_pic``, skipping the first.

        Returns:
            A list of rows on success. On failure the error is logged and
            the tuple ``(None, <error message>)`` is returned; this shape
            is kept for backward compatibility with existing callers.
        """
        session = SqliteAccess.connection()
        try:
            # text() keeps the raw statement valid on SQLAlchemy versions
            # that no longer accept plain strings in Session.execute().
            statement = text("SELECT * FROM sprider_pic LIMIT 10 OFFSET 1")
            return session.execute(statement).fetchall()
        except Exception as e:
            Logger.error("query_sprider" + str(e))
            return None, str(e)
        finally:
            session.close()
里面有一些我框架的东西还望见谅!
数据库是 sqlite3 的,原始文件我就不上传了,自己动手搞搞,锻炼一下。