Commit 90a51741 authored by jimmy

Submit data

parent 0c9219f3
@@ -13,9 +13,10 @@ def GetCompany(types):
     response = requests.get(cyaml.data[cyaml.data["env"]]["url"]+cyaml.data["php-api"]["getcompany"]+"?type="+str(types), headers={"Content-Type": "application/json"})
     if response.status_code == 200:
         response_data = response.json()
-        # if response_data.get('name') is not None:
-        #return "上海临方股权投资管理有限公司","https://www.zhipin.com/web/geek/job?query=%E5%A4%A7%E6%97%8F%E6%BF%80%E5%85%89&city=100010000"
-        #return "上海临方股权投资管理有限公司","https://baike.baidu.com/item/%E4%B8%8A%E6%B5%B7%E4%B8%B4%E6%96%B9%E8%82%A1%E6%9D%83%E6%8A%95%E8%B5%84%E7%AE%A1%E7%90%86%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8?fromModule=lemma_search-box"
+        if types == 88:  # boss test shortcut
+            return "上海临方股权投资管理有限公司", "https://www.zhipin.com/web/geek/job?query=%E5%A4%A7%E6%97%8F%E6%BF%80%E5%85%89&city=100010000"
+        elif types == 99:  # baidu baike test shortcut
+            return "上海临方股权投资管理有限公司", "https://baike.baidu.com/item/%E4%B8%8A%E6%B5%B7%E4%B8%B4%E6%96%B9%E8%82%A1%E6%9D%83%E6%8A%95%E8%B5%84%E7%AE%A1%E7%90%86%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8?fromModule=lemma_search-box"
         return response_data["data"]["company_name"], response_data["data"]["url"]
     return "", ""
@@ -45,16 +46,47 @@ def SaveCompanyData(name,types,intro):
 # start the crawler
+def CrawlerLaunchTypes():
+    types = 88
+    match types:
+        case 99:
+            print("baidu baike test")
+            com_name1, baike_url = GetCompany(99)  # fetch company name and baike URL
+            if baike_url != "":
+                crawler_baidu.CrawlerBaidu(com_name1, baike_url)
+            else:
+                Log(com_name1 + "-baidu url is empty:")
+        case 88:
+            print("boss test")
+            while True:
+                time.sleep(2)
+                com_name2, boss_url = GetCompany(88)  # fetch company name and boss URL
+                if boss_url != "":
+                    crawler_boss.CrawlerBoss(com_name2, boss_url)
+                else:
+                    Log(com_name2 + "-boss url is empty:")
+        case _:
+            CrawlerLaunch()
 def CrawlerLaunch():
     while True:
         now = datetime.datetime.now()
         randomtime = random.randint(3, cyaml.data[cyaml.data["env"]]["randomtime"])
+        Log(str(now) + ": startup wait time " + str(randomtime))
         print(str(now) + ": startup wait time " + str(randomtime))
         time.sleep(randomtime)
-        res1 = crawler_baidu.CrawlerBaidu()
-        if res1 == 100:
-            time.sleep(5)
-            crawler_boss.CrawlerBoss()
+        com_name1, baike_url = GetCompany(1)  # 1: fetch company name and baike URL
+        if baike_url != "":
+            crawler_baidu.CrawlerBaidu(com_name1, baike_url)
+        else:
+            Log(com_name1 + "-baidu url is empty:")
+        com_name2, boss_url = GetCompany(2)  # 2: fetch company name and boss URL
+        if boss_url != "":
+            crawler_boss.CrawlerBoss(com_name2, boss_url)
+        else:
+            Log(com_name2 + "-boss url is empty:")
 def Log(dataS):
     with open("log/"+str(datetime.date.today()) + ".txt", "a+") as f:
@@ -63,7 +95,7 @@ def Log(dataS):
 if __name__ == "__main__":
-    p = multiprocessing.Process(target=CrawlerLaunch)
+    p = multiprocessing.Process(target=CrawlerLaunchTypes)
     p.start()
     p.join()
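
A portability note on the dispatcher added above: `match`/`case` is structural pattern matching and requires Python 3.10 or newer; on an older interpreter the module fails with a SyntaxError at import time. A minimal equivalent sketch for older runtimes (the two helper names are hypothetical wrappers around the branches shown in the diff):

```python
# Same dispatch as CrawlerLaunchTypes without match/case, for Python < 3.10.
# RunBaiduOnce and RunBossLoop are hypothetical helpers wrapping the two
# branches from the hunk above.
def CrawlerLaunchTypesCompat():
    types = 88
    if types == 99:
        RunBaiduOnce()
    elif types == 88:
        RunBossLoop()
    else:
        CrawlerLaunch()
```
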
@@ -6,23 +6,25 @@ import cyaml
 import crawler
 # start the crawler
-def CrawlerBaidu():
+def CrawlerBaidu(com_name, baike_url):
     with sync_playwright() as playwright:
-        return GetBaiduCompany(playwright)
+        return GetBaiduCompany(playwright, com_name, baike_url)
 # crawl the baidu data
-def GetBaiduCompany(playwright: Playwright) -> int:
+def GetBaiduCompany(playwright: Playwright, com_name, baike_url) -> int:
     browser = playwright.chromium.launch(headless=cyaml.data[cyaml.data["env"]]["headless"])
     context = browser.new_context(viewport={"width": 800, "height": 600})
     page = context.new_page()
+    name = com_name
+    url = baike_url
     # the company whose data is to be crawled
-    name, url = crawler.GetCompany(1)
-    crawler.Log(name + "-baidu requesting data: " + url)
-    if url != "":
+    crawler.Log("--------------------------baidu baike start--------------------------------")
+    crawler.Log("Requesting the baike page for " + name + ", url: " + url)
     page.goto(url)
     all = page.locator(".lemma-summary").all_text_contents()
     intro = re.sub(r'\[[\d-]+\]', '', str(all))
-    crawler.Log(name + "-baidu data scraped: " + intro)
+    crawler.Log(name + " data scraped from the baike page: " + intro)
     if name != "" and intro != '[]':
         intro_new = str(intro[2:len(intro) - 2])
         new_string1 = re.sub(r'\\n', "", intro_new)
@@ -30,14 +32,14 @@ def GetBaiduCompany(playwright: Playwright) -> int:
         name2 = crawler.SaveCompanyData(name, 1, new_string2)
         if name2 != "":
-            crawler.Log(name2 + ": baidu data written successfully")
+            crawler.Log(name2 + " baike data posted back to php: success")
             return 100
         else:
-            crawler.Log(name + ": baidu data write failed")
+            crawler.Log(name + " baike data posted back to php: failed")
             return 100
     else:
-        name2 = crawler.SaveCompanyData(name, 1, "")
-        crawler.Log(name + ": baidu data write failed")
+        crawler.SaveCompanyData(name, 1, "")
+        crawler.Log(name + " no baike data found; empty result posted back to php")
         return 100
 # ---------------------
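
As a quick sanity check of the cleanup step above: `re.sub(r'\[[\d-]+\]', '', ...)` strips baike-style footnote markers such as `[1]` or `[2-3]` from the scraped summary. A minimal standalone sketch:

```python
import re

# Sample baike-style summary text with footnote markers (illustrative only).
raw = "成立于2010年[1],总部位于上海[2-3],主营股权投资。"
cleaned = re.sub(r'\[[\d-]+\]', '', raw)  # removes [1] and [2-3]
print(cleaned)  # -> 成立于2010年,总部位于上海,主营股权投资。
```
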
@@ -4,17 +4,16 @@ from playwright.sync_api import Playwright, sync_playwright
 import json
 import cyaml
-def spider_company(page):
-    company_nameO, url = crawler.GetCompany(2)
-    crawler.Log(company_nameO + "-boss requesting data: " + url)
-    if url != "":
+def spider_company(page, com_name, boss_url):
+    company_nameO = com_name
+    url = boss_url
+    crawler.Log("--------------------------boss start--------------------------------")
+    crawler.Log("Requesting the boss page for " + company_nameO + ", url: " + url)
     page.goto(url)
     page.wait_for_timeout(3000)
     page.wait_for_selector(".input-wrap-text")
     company_detail_el = page.locator('div.company-card-wrapper a')
     company_detail_el_count = company_detail_el.count()
-    crawler.Log("company_detail_el.count():" + str(company_detail_el_count))
     my_obj = {'intro': ""}
     if company_detail_el_count > 0:
         company_detail_el.first.click()
@@ -30,19 +29,20 @@ def spider_company(page):
             if company_nameO in company_name:
                 my_obj['intro'] = company_intro
-                crawler.Log(company_name + "-boss data scraped: " + str(company_intro))
+                crawler.Log(company_name + " data scraped from the boss page: " + str(company_intro))
                 name2 = crawler.SaveCompanyData(company_nameO, 2, str(company_intro))
                 if name2 != "":
-                    crawler.Log(name2 + ": boss data written successfully")
+                    crawler.Log(name2 + " boss data posted back to php: success")
                 else:
-                    crawler.Log(company_nameO + ": boss data write failed")
+                    crawler.Log(company_nameO + " boss data posted back to php: failed")
             else:
-                crawler.SaveCompanyData(company_nameO, 2, "")
+                name = crawler.SaveCompanyData(company_nameO, 2, "")
+                if name != "":
+                    crawler.Log(company_nameO + " no boss data found; empty result posted back to php: success")
                 else:
-                    crawler.SaveCompanyData(company_nameO, 2, "")
+                    crawler.Log(company_nameO + " no boss data found; empty result posted back to php: failed")
-def GetBossCompany(p: Playwright) -> None:
+def GetBossCompany(p: Playwright, com_name, boss_url) -> None:
     browser = p.chromium.launch(headless=cyaml.data[cyaml.data["env"]]["headless"])
     context = browser.new_context(viewport={"width": 800, "height": 600})
     js = """
@@ -50,13 +50,13 @@ def GetBossCompany(p: Playwright) -> None:
     """
     page = context.new_page()
     page.add_init_script(js)
-    spider_company(page)
+    spider_company(page, com_name, boss_url)
     context.close()
     browser.close()
 # start the crawler
-def CrawlerBoss():
+def CrawlerBoss(com_name, boss_url):
     with sync_playwright() as playwright:
-        GetBossCompany(playwright)
+        GetBossCompany(playwright, com_name, boss_url)
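
The `js` payload passed to `page.add_init_script` is collapsed in this diff, so its contents are unknown here. A common stealth pattern for this kind of scraper (an assumption about its purpose, not the repo's actual script) is hiding the `navigator.webdriver` automation flag before any page script runs:

```python
from playwright.sync_api import sync_playwright

# Hypothetical stealth init script; the repo's actual js payload is
# collapsed in the diff, so this only illustrates the technique.
js = """
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
"""

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    context = browser.new_context()
    page = context.new_page()
    page.add_init_script(js)  # injected before any page script on every navigation
    page.goto("https://example.com")
    print(page.evaluate("navigator.webdriver"))  # -> None with the override in place
    browser.close()
```
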
from playwright.sync_api import Playwright, sync_playwright, expect
import re
import json
import datetime
import crawler
# start the crawler
def CrawlerFutuNiuniu():
    with sync_playwright() as playwright:
        return GetFutuCompany(playwright)
def GetFutuCompany(playwright: Playwright) -> int:
    browser = playwright.chromium.launch(headless=False)
    context = browser.new_context(viewport={"width": 800, "height": 600})
    page = context.new_page()
    # the company whose data is to be crawled
    name, url = crawler.GetCompany(1)
    if url != "":
        page.goto(url)
        all = page.locator(".lemma-summary").all_text_contents()
        intro = re.sub(r'\[[\d-]+\]', '', str(all))
        crawler.Log(name + "-futu data scraped: " + intro)
        if name != "" and intro != '[]':
            name2 = crawler.SaveCompanyData(name, 1, intro)
            if name2 != "":
                crawler.Log(name2 + ": futu data written successfully")
                return 100
            else:
                crawler.Log(name + ": futu data write failed")
                return 100
        else:
            crawler.Log(name + ": futu data write failed")
            return 100
    # ---------------------
    context.close()
    browser.close()
    return 100
CrawlerFutuNiuniu()
\ No newline at end of file
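
A small review note on this new file (and on crawler_qqdoc.py below): the entry point is called at module level, so merely importing the module starts a crawl with a visible browser. The standard guard avoids that side effect:

```python
# Run the crawl only when the file is executed directly, not on import.
if __name__ == "__main__":
    CrawlerFutuNiuniu()
```
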
from playwright.sync_api import Playwright, sync_playwright, expect
import re
import json
import cyaml
import crawler
# start the crawler
def CrawlerQqdoc(com_name, url):
    with sync_playwright() as playwright:
        return GetQqdocCompany(playwright, com_name, url)
# crawl the qq doc data
def GetQqdocCompany(playwright: Playwright, com_name, url) -> int:
    browser = playwright.chromium.launch(headless=cyaml.data[cyaml.data["env"]]["headless"])
    # clipboard-read is needed so the copied document text can be read back below
    context = browser.new_context(viewport={"width": 800, "height": 600}, permissions=["clipboard-read"])
    page = context.new_page()
    name = com_name
    # the company whose data is to be crawled
    crawler.Log("--------------------------qqdoc start--------------------------------")
    page.goto(url)
    page.wait_for_timeout(8000)
    page.wait_for_selector(".melo-page-container-view")
    # page.locator(".melo-page-container-view").all_text_contents()
    # page.frame_locator("iframe[name=\"login_frame\"]").frame_locator("iframe").get_by_role("link", name="Continue to use in browser").click()
    # select the whole document body and copy it (Meta works on macOS; use Control elsewhere)
    page.get_by_label("腾讯文档正文内容").press("Meta+a")
    page.get_by_label("腾讯文档正文内容").press("Meta+c")
    all = page.evaluate("() => navigator.clipboard.readText()")  # read the copied text back
    intro = re.sub(r'\[[\d-]+\]', '', str(all))
    crawler.Log(name + " data scraped from qqdoc: " + intro)
    if name != "" and intro != '[]':
        intro_new = str(intro[2:len(intro) - 2])
        new_string1 = re.sub(r'\\n', "", intro_new)
        new_string2 = re.sub(r'\\xa0', "", new_string1)
        name2 = crawler.SaveCompanyData(name, 1, new_string2)
        if name2 != "":
            crawler.Log(name2 + " qqdoc data posted back to php: success")
            return 100
        else:
            crawler.Log(name + " qqdoc data posted back to php: failed")
            return 100
    else:
        crawler.SaveCompanyData(name, 1, "")
        crawler.Log(name + " no qqdoc data found; empty result posted back to php")
        return 100
    # ---------------------
    context.close()
    browser.close()
    return 100
CrawlerQqdoc("ce", "https://doc.weixin.qq.com/doc/w3_AXEAcwZhACknEowzTBmRw6jzHlbf9?scode=AMwA6QetAAYQxyo3EtAcQADQaTAHI")
\ No newline at end of file
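
A note on the copy-then-read-back approach: Chromium only exposes `navigator.clipboard.readText()` to a page when the context grants the clipboard permissions, and clipboard behavior in headless mode can vary. A minimal standalone sketch of the pattern, against example.com rather than the real Tencent Doc (assumptions: Chromium, which is the Playwright browser that honors these permissions, and a headed or clipboard-capable environment):

```python
from playwright.sync_api import sync_playwright

# Sketch of the copy-then-read-clipboard pattern used above; headless
# clipboard support may vary, so run headed if readText() comes back empty.
with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    context = browser.new_context(permissions=["clipboard-read", "clipboard-write"])
    page = context.new_page()
    page.goto("https://example.com")
    page.keyboard.press("Control+a")   # Meta+a on macOS
    page.keyboard.press("Control+c")
    text = page.evaluate("() => navigator.clipboard.readText()")
    print(text[:80])  # first 80 characters of the copied page text
    browser.close()
```
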