class MyParser(object):
    '''
    Template parser used by the spider.

    It knows how to collect the data needed to log in, how to list the
    urls of a paginated catalogue and how to extract the content of one page.
    NOTE(review): get_urls() calls self.uni_parser(), which is not defined
    in this class as shown here - presumably it is provided elsewhere
    (e.g. by a subclass); confirm before use.
    '''

    def login_data_parser(self, login_url):
        '''
        This parser is for chd
        :param login_url: the url you want to login
        :return: (a dict with login data, cookies)
        '''
        response = requests.get(login_url)
        html = response.text
        # parse the html
        soup = BeautifulSoup(html, 'lxml')
        # insert parser, following is an example
        example_data = soup.find('input', {'name': 'example_data'})['value']
        login_data = {
            'example_data': example_data
        }
        return login_data, response.cookies

    def get_urls(self, catalogue_url, **kwargs):
        '''
        get all urls that needs to crawl.
        :param catalogue_url: url of the catalogue; everything after the
            first "?" is dropped before requesting the pages
        :param kwargs: forwarded unchanged to every self.uni_parser() call
        :return: a list of urls, each one prefixed with the base url
        '''
        # prepare
        base_url = 'http://example.cn/'
        cata_base_url = catalogue_url.split('?')[0]
        para = {
            'pageIndex': 1
        }

        # get the number of pages
        page_num_xpath = '//*[@id="page_num"]/text()'
        page_num = int(self.uni_parser(cata_base_url, page_num_xpath,
                                       params=para, **kwargs))

        # repeat get single catalogue's urls
        link_xpath = '//a/@href'  # link tag's xpath
        url_list = []
        for i in range(1, page_num + 1):
            para['pageIndex'] = i
            # get single catalogue's urls
            urls = self.uni_parser(cata_base_url, link_xpath,
                                   params=para, **kwargs)
            url_list.extend(base_url + str(url) for url in urls)

        return url_list

    def get_content(self, url, **kwargs):
        '''
        get content from the parameter "url"
        :param url: the page to request (sent with POST)
        :param kwargs: forwarded unchanged to requests.post()
        :return: str() of the <div id="content"> element found in the page
        '''
        html = requests.post(url, **kwargs).text
        soup = BeautifulSoup(html, 'lxml')
        content = soup.find('div', id='content')
        # str() is applied to whatever find() returned
        return str(content)
def insert(self, db, table, record_dict):
    '''
    Insert one record into a table.
    :param db:name of database that you want to use
    :param table:name of table that you want to use
    :param record_dict:key for column,value for value
    '''
    # 1.use the database
    sql = 'use {}'.format(db)
    self.cursor.execute(sql)
    self.conn.commit()

    # 2.connect the sql commend
    # Identifiers (table / column names) cannot be bound as query
    # parameters, so they are still formatted into the statement and must
    # come from trusted code. The values are bound as parameters, so a
    # value containing quotes can no longer break or alter the statement.
    columns = ','.join(str(column) for column in record_dict)
    placeholders = ','.join(['%s'] * len(record_dict))
    sql = 'insert into {}({}) values({})'.format(table, columns, placeholders)

    # 3.execute the statement
    # Every value is still converted with str(), as the original
    # string-built statement did.
    values = [str(value) for value in record_dict.values()]
    self.cursor.execute(sql, values)
    self.conn.commit()
class MySpider(object):
    '''
    Spider that logs in, lists the urls of a catalogue through its parser
    and hands the content of every url to a save function.
    '''

    def __init__(self, parser, save, **save_params):
        '''
        :param parser: object providing login_data_parser(), get_urls()
            and get_content()
        :param save: function called as save(content, **save_params)
        :param save_params: keyword arguments forwarded to save
        '''
        self.parser = parser  # parser is a object of class
        self.save = save  # save is a function
        self.save_params = save_params
        self.cookies = None
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
        }

    def login(self, login_url, home_page_url):
        '''
        login
        :param login_url: the url you want to login
        :param home_page_url: Used to determine if you have logged in
            successfully; when it is None no redirect is followed
        :return: response of login
        '''
        # get the login data
        login_data, cookies = self.parser.login_data_parser(login_url)

        # login without redirecting
        response = requests.post(login_url, headers=self.headers,
                                 data=login_data, cookies=cookies,
                                 allow_redirects=False)

        cookies_num = 1
        # if spider is not reach the target page
        while home_page_url is not None and response.url != home_page_url:
            print('[spider]: I am at the "{}" now'.format(response.url))
            print('[spider]: I have got a cookie!Its content is that \n"{}"'.format(response.cookies))
            # merge the two cookies
            cookies = dict(cookies, **response.cookies)
            cookies = requests.utils.cookiejar_from_dict(cookies)
            cookies_num += 1
            print('[spider]: Now I have {} cookies!'.format(cookies_num))
            next_station = response.headers.get('Location')
            if next_station is None:
                # No redirect target was sent, so there is nothing left to
                # follow; stop here instead of failing with a KeyError.
                print('[spider]: There is no "Location" header, so I stop here')
                break
            print('[spider]: Then I will go to the page whose url is "{}"'.format(next_station))
            response = requests.post(next_station, headers=self.headers,
                                     cookies=cookies, allow_redirects=False)

        # Keep the cookies so that crawl() sends them with its requests.
        self.cookies = cookies
        return response

    def crawl(self, login_url, home_page_url, catalogue_url):
        '''
        Log in, then save the content of every url of the catalogue.
        :param login_url: passed to login()
        :param home_page_url: passed to login()
        :param catalogue_url: passed to the parser's get_urls()
        '''
        self.login(login_url, home_page_url)
        url_list = self.parser.get_urls(catalogue_url, cookies=self.cookies,
                                        headers=self.headers)
        for url in url_list:
            content = self.parser.get_content(url, cookies=self.cookies,
                                              headers=self.headers)
            self.save(content, **self.save_params)
from my_spider import MySpider from my_parser import MyParser from my_database import MyDatabase from bs4 import BeautifulSoup import requests import pymysql
class chdParser(MyParser):
    '''
    Skeleton of a site-specific parser built on MyParser.

    Every "pass" below is a placeholder that has to be replaced with real
    code before the method can run; the names used after a placeholder
    (login_data, response, page_num, para, url_list) are not defined until
    that is done.
    '''

    def login_data_parser(self, login_url):
        '''
        This parser is for chd
        :param login_url: the url you want to login
        :return: (a dict with login data, cookies)
        '''
        # placeholder: must define login_data and response
        pass
        return login_data, response.cookies

    def get_urls(self, catalogue_url, **kwargs):
        '''
        get all urls that needs to crawl.
        '''
        # prepare
        pass

        # get page number
        # placeholder: must define page_num
        pass

        # repeat get single catalogue's urls
        # placeholder: must define para and url_list
        pass
        for i in range(1, page_num + 1):
            para['pageIndex'] = i
            # get single catalogue's urls
            pass
        return url_list