Commit 59f3fcb8 authored by Charles刘佳佳

fixed: add a retry mechanism to the crawler, add logging, add screenshots on exceptions

parent 0c9219f3
import datetime
import hashlib
import multiprocessing
import time
import json
import redis
import crawler_baidu
import crawler_boss
import random
import requests
import cyaml

WAIT_SPIDER_BAIKE_COMPANY_LIST = "sw:wait:spider:bk:company:list"
WAIT_SPIDER_BOSS_COMPANY_LIST = "sw:wait:spider:boss:company:list"


# Get a company that still needs crawling (type 1 = Baidu)
def GetCompany(types):
    response = requests.get(
        cyaml.data[cyaml.data["env"]]["url"] + cyaml.data["php-api"]["getcompany"] + "?type=" + str(types),
        headers={"Content-Type": "application/json"})
    if response.status_code == 200:
        response_data = response.json()
        # if response_data.get('name') is not None:
        # return "上海临方股权投资管理有限公司","https://www.zhipin.com/web/geek/job?query=%E5%A4%A7%E6%97%8F%E6%BF%80%E5%85%89&city=100010000"
        # return "上海临方股权投资管理有限公司","https://baike.baidu.com/item/%E4%B8%8A%E6%B5%B7%E4%B8%B4%E6%96%B9%E8%82%A1%E6%9D%83%E6%8A%95%E8%B5%84%E7%AE%A1%E7%90%86%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8?fromModule=lemma_search-box"
        return response_data["data"]["company_name"], response_data["data"]["url"]
    return "", ""

# Push a company back onto its crawl queue for a retry
def DoSpiderCompany(company, crawler_website):
    if crawler_website == "baike":
        redis_key = WAIT_SPIDER_BAIKE_COMPANY_LIST
    elif crawler_website == "boss":
        redis_key = WAIT_SPIDER_BOSS_COMPANY_LIST
    else:
        redis_key = ""
    if redis_key != "":
        r = GetRedisConnect()
        # Remove any existing entry first, then append
        r.lrem(redis_key, -1, company)
        r.rpush(redis_key, company)
    return 200


# Post the crawled data back to the backend
def SaveCompanyData(name, types, intro):
    my_obj = {"intro": intro}
    post_data = {
        "name": name,
        "content": json.dumps(my_obj),
        "type": types
    }
    json_data = json.dumps(post_data)
    response = requests.post(cyaml.data[cyaml.data["env"]]["url"] + cyaml.data["php-api"]["getcompany"], data=json_data,
                             headers={"Content-Type": "application/json"})
    # Log("保存数据时候的返回:" + response)
    # Check the response status code
    if response.status_code == 200:
        response_data = response.json()
        if response_data["code"] == 200:
            return name
        else:
            return ""
@@ -43,27 +68,79 @@ def SaveCompanyData(name, types, intro):
    return ""


# Start the crawlers
def CrawlerLaunch():
    while True:
        now = datetime.datetime.now()
        randomtime = random.randint(3, cyaml.data[cyaml.data["env"]]["randomtime"])
        print(str(now) + ":启动等待时间" + str(randomtime))
        time.sleep(randomtime)
        crawler_baidu.CrawlerBaidu()
        crawler_boss.CrawlerBoss()


def Log(dataS):
    now = datetime.datetime.now()
    date_time = now.strftime('%Y-%m-%d %H:%M:%S')
    with open("log/" + str(datetime.date.today()) + ".txt", "a+") as f:
        f.write(date_time + ":" + dataS + "\n")
        f.close()
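
# Open a Redis connection using the host/port/db/password from the cyaml env config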
def GetRedisConnect():
    conf = cyaml.data[cyaml.data["env"]]
    r = redis.StrictRedis(host=conf['redis_host'], port=conf['redis_port'], db=conf['redis_db'],
                          password=conf['redis_password'], client_name=conf['redis_client'])
    return r

def Md5Encode(value: str):
    # Create the md5 object
    md5 = hashlib.md5()
    # Feed the value into the hash
    md5.update(value.encode("utf-8"))
    # Get the md5 hex digest
    result = md5.hexdigest()
    return result
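
# Track per-company/per-source retry counts in Redis and return the current attempt number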
def CheckUpperLimit(name, crawler_org):
    # No profile data was found and the maximum attempt count has not been reached yet
    r = GetRedisConnect()
    redis_key = name + crawler_org
    # MD5-hash the company name + source into a unique id
    identifier = Md5Encode(redis_key)
    # Look up how many times this company has already been queried
    r_data = r.get(identifier)
    if r_data is not None:
        upper_limit = int(cyaml.data[cyaml.data["env"]]['upper_limit'])
        json_data = json.loads(r_data)
        num = int(json_data.get(crawler_org, 0))
        Log(name + ":" + crawler_org + "数据" + ":当前次数为:" + str(num))
        # The attempt count reached the configured limit: tell dwp it was crawled with no content
        if num >= upper_limit:
            if crawler_org == "baike":
                SaveCompanyData(name, 1, "")
            elif crawler_org == "boss":
                SaveCompanyData(name, 2, "")
            # Delete the key once the limit has been reached
            r.delete(identifier)
            Log(name + ":" + crawler_org + "数据" + ":已达到上限,同步dwp修改状态为以爬取无内容,当前次数为:" + str(num))
        else:
            num += 1
            json_data[crawler_org] = num
            r.set(identifier, json.dumps(json_data))
            Log(name + "未达到上限,次数+1,当前次数为:" + str(num))
    else:
        # No such key in redis yet; initialize the counter
        json_dict = {crawler_org: "1"}
        r.set(identifier, json.dumps(json_dict))
        num = 1
        Log(name + ":" + crawler_org + "数据,数据查询无内容,当前查询次数:" + str(num))
    return num


if __name__ == "__main__":
    p = multiprocessing.Process(target=CrawlerLaunch)
    p.start()
    p.join()
@@ -5,24 +5,29 @@ import json
import cyaml
import crawler


# Start the crawler
def CrawlerBaidu():
    with sync_playwright() as playwright:
        return GetBaiduCompany(playwright)


# Crawl the company profile from Baidu Baike
def GetBaiduCompany(playwright: Playwright) -> int:
    # Get the company that needs to be crawled
    name, url = crawler.GetCompany(1)
    if url != "":
        headless = bool(cyaml.data[cyaml.data["env"]]["headless"])
        browser = playwright.chromium.launch(headless=headless)
        context = browser.new_context(viewport={"width": 1280, "height": 960})
        page = context.new_page()
        crawler.Log(name + "-百度开始请求数据:" + url)
        page.goto(url)
        all_summary = page.locator(".lemma-summary").all_text_contents()
        intro = re.sub(r'\[[\d-]+\]', '', str(all_summary))
        crawler.Log(name + "-获取到百度数据:" + intro)
        try:
            if name != "" and intro != '[]':
                intro_new = str(intro[2:len(intro) - 2])
                new_string1 = re.sub(r'\\n', "", intro_new)
@@ -30,17 +35,24 @@ def GetBaiduCompany(playwright: Playwright) -> int:
                name2 = crawler.SaveCompanyData(name, 1, new_string2)
                if name2 != "":
                    crawler.Log(name2 + ":百度数据,写入成功")
                else:
                    crawler.Log(name + ":百度数据,写入失败")
            else:
                num = crawler.CheckUpperLimit(name, "baike")
                upper_limit = int(cyaml.data[cyaml.data["env"]]["upper_limit"])
                if num < upper_limit:
                    # Below the retry limit: push the company back onto the redis queue
                    crawler.Log(name + ":百科查询为空,未达到请求上限,重新推入redis队列")
                    crawler.DoSpiderCompany(name, "baike")
        except Exception as e:
            crawler.Log(name + " 百科请求报错:" + str(e))
            if name != "":
                path = "img/" + name + "-baike.png"
                page.screenshot(path=path)
            # Re-queue the company
            # crawler.DoSpiderCompany(name, "baike")
        finally:
            context.close()
            browser.close()
    return 100
@@ -4,16 +4,18 @@ from playwright.sync_api import Playwright, sync_playwright
import json
import cyaml
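
# Look the company up on BOSS直聘 and post back whatever profile text is found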
def spider_company(page):
    company_nameO, url = crawler.GetCompany(2)
    if url != "":
        crawler.Log(company_nameO + "-boss开始请求数据:" + url)
        try:
            page.goto(url)
            page.wait_for_timeout(3000)
            page.wait_for_selector(".input-wrap-text")
            company_detail_el = page.locator('div.company-card-wrapper a')
            company_detail_el_count = company_detail_el.count()
            crawler.Log("boss company_detail_el.count():" + str(company_detail_el_count))
            my_obj = {'intro': ""}
            if company_detail_el_count > 0:
@@ -30,13 +32,30 @@ def spider_company(page):
                if company_nameO in company_name:
                    my_obj['intro'] = company_intro
                    crawler.Log(company_name + "-获取到boss数据:" + str(company_intro))
                name2 = crawler.SaveCompanyData(company_nameO, 2, str(company_intro))
                if name2 != "":
                    crawler.Log(name2 + ":boss数据,写入成功")
                else:
                    crawler.Log(company_nameO + ":boss数据,写入失败")
            else:
                # No company content found; screenshot the current page
                # if company_nameO != "":
                #     path = "img/" + company_nameO + "-boss-info.png"
                #     page.screenshot(path=path)
                # Re-queue the company
                # Do not re-queue for now: BOSS shows a captcha, keep this off until that is solved
                # crawler.DoSpiderCompany(company_nameO)
                crawler.SaveCompanyData(company_nameO, 2, "")
        except Exception as e:
            crawler.Log(company_nameO + " boss请求报错:" + str(e))
            # if company_nameO != "":
            #     path = "img/" + company_nameO + "-boss-error.png"
            #     page.screenshot(path=path)
            # Re-queue the company
            # Do not re-queue for now: BOSS shows a captcha, keep this off until that is solved
            # crawler.DoSpiderCompany(name, "boss")
        finally:
            crawler.SaveCompanyData(company_nameO, 2, "")
    else:
        crawler.SaveCompanyData(company_nameO, 2, "")
@@ -44,7 +63,7 @@ def spider_company(page):
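
# Browser setup for the BOSS crawler; the injected script below masks the webdriver flag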
def GetBossCompany(p: Playwright) -> None:
    browser = p.chromium.launch(headless=cyaml.data[cyaml.data["env"]]["headless"])
    context = browser.new_context(viewport={"width": 1280, "height": 960})
    js = """
    Object.defineProperties(navigator, {webdriver:{get:()=>undefined}});
    """
@@ -55,8 +74,8 @@ def GetBossCompany(p: Playwright) -> None:
    context.close()
    browser.close()


# Start the crawler
def CrawlerBoss():
    with sync_playwright() as playwright:
        GetBossCompany(playwright)

playwright==1.37.0 playwright==1.37.0
Requests==2.31.0 Requests==2.31.0
PyYAML==6.0.1 PyYAML==6.0.1
redis~=5.0.1
\ No newline at end of file