Commit e909d7c9 authored by jimmy蒋政彪

Merge branch 'dev-charles-private' into 'master'

fixed: add a retry mechanism to the crawler, add logging, add screenshots on exceptions

See merge request !1
parents 0c9219f3 59f3fcb8
import datetime
import hashlib
import multiprocessing
import time
import json
import redis
import crawler_baidu
import crawler_boss
import random
import requests
import cyaml
# Redis queues of companies waiting to be crawled
WAIT_SPIDER_BAIKE_COMPANY_LIST = "sw:wait:spider:bk:company:list"
WAIT_SPIDER_BOSS_COMPANY_LIST = "sw:wait:spider:boss:company:list"
# Fetch the company that needs crawling (type 1 = Baidu Baike, type 2 = Boss)
def GetCompany(types):
response = requests.get(
cyaml.data[cyaml.data["env"]]["url"] + cyaml.data["php-api"]["getcompany"] + "?type=" + str(types),
headers={"Content-Type": "application/json"})
if response.status_code == 200:
response_data = response.json()
# return "上海临方股权投资管理有限公司","https://www.zhipin.com/web/geek/job?query=%E5%A4%A7%E6%97%8F%E6%BF%80%E5%85%89&city=100010000"
# return "上海临方股权投资管理有限公司","https://baike.baidu.com/item/%E4%B8%8A%E6%B5%B7%E4%B8%B4%E6%96%B9%E8%82%A1%E6%9D%83%E6%8A%95%E8%B5%84%E7%AE%A1%E7%90%86%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8?fromModule=lemma_search-box"
return response_data["data"]["company_name"], response_data["data"]["url"]
return "", ""
# Put the company back onto the crawl queue
def DoSpiderCompany(company, crawler_website):
if crawler_website == "baike":
redis_key = WAIT_SPIDER_BAIKE_COMPANY_LIST
elif crawler_website == "boss":
redis_key = WAIT_SPIDER_BOSS_COMPANY_LIST
else:
redis_key = ""
if redis_key != "":
r = GetRedisConnect()
        # Remove any existing entry first, then append
r.lrem(redis_key, -1, company)
r.rpush(redis_key, company)
return 200
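# Usage sketch (illustration only, not part of the original change; the name is a placeholder):
#   DoSpiderCompany("某公司", "baike") removes any existing copy of the name from
#   sw:wait:spider:bk:company:list and appends it again, so a waiting company sits in
#   the queue at most once per source while it is retried.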
# Send the crawled data back to the server
def SaveCompanyData(name, types, intro):
my_obj = {"intro": intro}
post_data = {
"name": name,
"content": json.dumps(my_obj),
"type": types
}
json_data = json.dumps(post_data)
response = requests.post(cyaml.data[cyaml.data["env"]]["url"] + cyaml.data["php-api"]["getcompany"], data=json_data,
headers={"Content-Type": "application/json"})
# Log("保存数据时候的返回:" + response)
    # Check the response status code
if response.status_code == 200:
response_data = response.json()
if response_data["code"] == 200:
return name
else:
return ""
@@ -43,27 +68,79 @@ def SaveCompanyData(name,types,intro):
return ""
# Launch the crawlers (main loop)
def CrawlerLaunch():
while True:
now = datetime.datetime.now()
randomtime = random.randint(3, cyaml.data[cyaml.data["env"]]["randomtime"])
print(str(now) + ":启动等待时间" + str(randomtime))
time.sleep(randomtime)
res1 = crawler_baidu.CrawlerBaidu()
        if res1 == 100:
time.sleep(5)
crawler_baidu.CrawlerBaidu()
crawler_boss.CrawlerBoss()
def Log(dataS):
now = datetime.datetime.now()
date_time = now.strftime('%Y-%m-%d %H:%M:%S')
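    # One log file per day; the log/ directory is assumed to exist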
with open("log/" + str(datetime.date.today()) + ".txt", "a+") as f:
f.write(date_time + ":" + dataS + "\n")
def GetRedisConnect():
conf = cyaml.data[cyaml.data["env"]]
r = redis.StrictRedis(host=conf['redis_host'], port=conf['redis_port'], db=conf['redis_db'],
password=conf['redis_password'], client_name=conf['redis_client'])
return r
def Md5Encode(value: str):
    # Create an MD5 hash object
    md5 = hashlib.md5()
    # Feed in the UTF-8 encoded value
    md5.update(value.encode("utf-8"))
    # Return the hex digest
    result = md5.hexdigest()
    return result
def CheckUpperLimit(name, crawler_org):
    # No data was retrieved for the company; count the attempt and check it against the configured maximum
r = GetRedisConnect()
redis_key = name + crawler_org
    # MD5 the company name (plus source) into a unique identifier
identifier = Md5Encode(redis_key)
    # Look up how many times it has already been queried
r_data = r.get(identifier)
if r_data is not None:
upper_limit = int(cyaml.data[cyaml.data["env"]]['upper_limit'])
json_data = json.loads(r_data)
num = int(json_data.get(crawler_org, 0))
Log(name + ":" + crawler_org + "数据" + ":当前次数为:" + str(num))
        # The query count has reached the configured limit: notify dwp that it was crawled with no content
if num >= upper_limit:
if crawler_org == "baike":
SaveCompanyData(name, 1, "")
elif crawler_org == "boss":
SaveCompanyData(name, 2, "")
            # Delete the key once the limit has been reached
r.delete(identifier)
Log(name + ":" + crawler_org + "数据" + ":已达到上限,同步dwp修改状态为以爬取无内容,当前次数为:" + str(num))
else:
num += 1
json_data[crawler_org] = num
r.set(identifier, json.dumps(json_data))
Log(name + "未达到上限,次数+1,当前次数为:" + str(num))
else:
        # No such key in Redis yet; initialize the counter
json_dict = {crawler_org: "1"}
r.set(identifier, json.dumps(json_dict))
num = 1
Log(name + ":" + crawler_org + "数据,数据查询无内容,当前查询次数:" + str(num))
return num
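# Usage sketch (illustration only; the company name is a placeholder). The counter is
# stored in Redis under Md5Encode(name + source) as JSON, initialized to {"baike": "1"}
# and incremented on each later miss:
#   num = CheckUpperLimit("某公司", "baike")                      # 1 on the first miss
#   if num < int(cyaml.data[cyaml.data["env"]]["upper_limit"]):
#       DoSpiderCompany("某公司", "baike")                        # re-queue for another attempt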
if __name__ == "__main__":
p = multiprocessing.Process(target=CrawlerLaunch)
p.start()
p.join()
@@ -5,24 +5,29 @@ import json
import cyaml
import crawler
# Crawler entry point
def CrawlerBaidu():
with sync_playwright() as playwright:
return GetBaiduCompany(playwright)
# Crawl company data from Baidu Baike
def GetBaiduCompany(playwright: Playwright) -> int:
    # Fetch the company to crawl
name, url = crawler.GetCompany(1)
if url != "":
headless = bool(cyaml.data[cyaml.data["env"]]["headless"])
browser = playwright.chromium.launch(headless=headless)
context = browser.new_context(viewport={"width": 1280, "height": 960})
page = context.new_page()
crawler.Log(name + "-百度开始请求数据:" + url)
page.goto(url)
all_summary = page.locator(".lemma-summary").all_text_contents()
intro = re.sub(r'\[[\d-]+\]', '', str(all_summary))
crawler.Log(name + "-获取到百度数据:" + intro)
try:
if name != "" and intro != '[]':
intro_new = str(intro[2:len(intro) - 2])
new_string1 = re.sub(r'\\n', "", intro_new)
@@ -30,17 +35,24 @@ def GetBaiduCompany(playwright: Playwright) -> int:
name2 = crawler.SaveCompanyData(name, 1, new_string2)
if name2 != "":
crawler.Log(name2 + ":百度数据,写入成功")
else:
crawler.Log(name + ":百度数据,写入失败")
return 100
else:
name2 = crawler.SaveCompanyData(name, 1, "")
# ---------------------
num = crawler.CheckUpperLimit(name, "baike")
upper_limit = int(cyaml.data[cyaml.data["env"]]["upper_limit"])
if num < upper_limit:
                    # Request limit not reached yet; push the company back onto the Redis queue
crawler.Log(name + ":百科查询为空,未达到请求上限,重新推入redis队列")
crawler.DoSpiderCompany(name, "baike")
except Exception as e:
crawler.Log(name + " 百科请求报错:" + str(e))
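            # Screenshot the failing page for debugging; the img/ directory is assumed to exist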
if name != "":
path = "img/" + name + "-baike.png"
page.screenshot(path=path)
            # Re-queue the company
# crawler.DoSpiderCompany(name, "baike")
finally:
context.close()
browser.close()
return 100
@@ -4,16 +4,18 @@ from playwright.sync_api import Playwright, sync_playwright
import json
import cyaml
def spider_company(page):
company_nameO, url = crawler.GetCompany(2)
if url != "":
crawler.Log(company_nameO + "-boss开始请求数据:" + url)
try:
page.goto(url)
page.wait_for_timeout(3000)
page.wait_for_selector(".input-wrap-text")
company_detail_el = page.locator('div.company-card-wrapper a')
company_detail_el_count = company_detail_el.count()
crawler.Log("boss company_detail_el.count():" + str(company_detail_el_count))
my_obj = {'intro': ""}
if company_detail_el_count > 0:
@@ -30,13 +32,30 @@ def spider_company(page):
if company_nameO in company_name:
my_obj['intro'] = company_intro
crawler.Log(company_name + "-获取到boss数据:" + str(company_intro))
name2 = crawler.SaveCompanyData(company_nameO, 2, str(company_intro))
if name2 != "":
crawler.Log(name2 + ":boss数据,写入成功")
else:
crawler.Log(company_nameO + ":boss数据,写入失败")
else:
                # No company content was found; screenshot the current page
# if company_nameO != "":
# path = "img/" + company_nameO + "-boss-info.png"
# page.screenshot(path=path)
                # Re-queue the company
                # Do not re-queue for now: Boss shows a CAPTCHA; not recommended to enable until that is solved
# crawler.DoSpiderCompany(company_nameO)
crawler.SaveCompanyData(company_nameO, 2, "")
except Exception as e:
crawler.Log(company_nameO + " boss请求报错:" + str(e))
# if company_nameO != "":
# path = "img/" + company_nameO + "-boss-error.png"
# page.screenshot(path=path)
            # Re-queue the company
            # Do not re-queue for now: Boss shows a CAPTCHA; not recommended to enable until that is solved
# crawler.DoSpiderCompany(name, "boss")
finally:
crawler.SaveCompanyData(company_nameO, 2, "")
else:
crawler.SaveCompanyData(company_nameO, 2, "")
@@ -44,7 +63,7 @@ def spider_company(page):
def GetBossCompany(p: Playwright) -> None:
browser = p.chromium.launch(headless=cyaml.data[cyaml.data["env"]]["headless"])
context = browser.new_context(viewport={"width": 1280, "height": 960})
js = """
Object.defineProperties(navigator, {webdriver:{get:()=>undefined}});
"""
@@ -55,8 +74,8 @@ def GetBossCompany(p: Playwright) -> None:
context.close()
browser.close()
# Crawler entry point
def CrawlerBoss():
with sync_playwright() as playwright:
GetBossCompany(playwright)
playwright==1.37.0
Requests==2.31.0
PyYAML==6.0.1
redis~=5.0.1
\ No newline at end of file
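The cyaml module imported by every script is not part of this merge request. A minimal sketch of what it might look like, assuming it simply loads a YAML file with PyYAML into a module-level dict named data (the file name and key layout below are inferred from how cyaml.data is read above, not confirmed by the diff):

# cyaml.py -- hypothetical sketch, not the real module
import yaml

# Expected keys, judging from the callers: "env" names the active environment block,
# which holds url, randomtime, headless, upper_limit and the redis_* settings,
# while "php-api" holds the getcompany endpoint path.
with open("config.yaml", "r", encoding="utf-8") as f:
    data = yaml.safe_load(f)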