Commit 90a51741 authored by jimmy

Commit data

parent 0c9219f3
@@ -13,9 +13,10 @@ def GetCompany(types):
    response = requests.get(cyaml.data[cyaml.data["env"]]["url"] + cyaml.data["php-api"]["getcompany"] + "?type=" + str(types), headers={"Content-Type": "application/json"})
    if response.status_code == 200:
        response_data = response.json()
        # if response_data.get('name') is not None:
        # return "上海临方股权投资管理有限公司", "https://www.zhipin.com/web/geek/job?query=%E5%A4%A7%E6%97%8F%E6%BF%80%E5%85%89&city=100010000"
        # return "上海临方股权投资管理有限公司", "https://baike.baidu.com/item/%E4%B8%8A%E6%B5%B7%E4%B8%B4%E6%96%B9%E8%82%A1%E6%9D%83%E6%8A%95%E8%B5%84%E7%AE%A1%E7%90%86%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8?fromModule=lemma_search-box"
        if types == 88:  # BOSS Zhipin test
            return "上海临方股权投资管理有限公司", "https://www.zhipin.com/web/geek/job?query=%E5%A4%A7%E6%97%8F%E6%BF%80%E5%85%89&city=100010000"
        elif types == 99:  # Baidu Baike test
            return "上海临方股权投资管理有限公司", "https://baike.baidu.com/item/%E4%B8%8A%E6%B5%B7%E4%B8%B4%E6%96%B9%E8%82%A1%E6%9D%83%E6%8A%95%E8%B5%84%E7%AE%A1%E7%90%86%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8?fromModule=lemma_search-box"
        return response_data["data"]["company_name"], response_data["data"]["url"]
    return "", ""
@@ -45,16 +46,47 @@ def SaveCompanyData(name,types,intro):
# Launch the crawler (test dispatcher)
def CrawlerLaunchTypes():
    types = 88
    match types:
        case 99:
            print("Baidu Baike test")
            com_name1, baike_url = GetCompany(99)  # 1: fetch the company name and Baike URL
            if baike_url != "":
                crawler_baidu.CrawlerBaidu(com_name1, baike_url)
            else:
                Log(com_name1 + " - Baike URL is empty")
        case 88:
            print("BOSS Zhipin test")
            while True:
                time.sleep(2)
                com_name2, boss_url = GetCompany(88)  # 2: fetch the company name and BOSS URL
                if boss_url != "":
                    crawler_boss.CrawlerBoss(com_name2, boss_url)
                else:
                    Log(com_name2 + " - BOSS URL is empty")
        case _:
            CrawlerLaunch()
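# Note: match/case requires Python 3.10+. On an older interpreter the same
# dispatch can be written with if/elif; a behavior-equivalent sketch (the name
# CrawlerLaunchTypesCompat is illustrative, not part of this commit):
def CrawlerLaunchTypesCompat():
    types = 88
    if types == 99:
        com_name1, baike_url = GetCompany(99)
        if baike_url != "":
            crawler_baidu.CrawlerBaidu(com_name1, baike_url)
        else:
            Log(com_name1 + " - Baike URL is empty")
    elif types == 88:
        while True:
            time.sleep(2)
            com_name2, boss_url = GetCompany(88)
            if boss_url != "":
                crawler_boss.CrawlerBoss(com_name2, boss_url)
            else:
                Log(com_name2 + " - BOSS URL is empty")
    else:
        CrawlerLaunch()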
def CrawlerLaunch():
    while True:
        now = datetime.datetime.now()
        randomtime = random.randint(3, cyaml.data[cyaml.data["env"]]["randomtime"])
        Log(str(now) + ": startup wait time " + str(randomtime))
        print(str(now) + ": startup wait time " + str(randomtime))
        time.sleep(randomtime)
        res1 = crawler_baidu.CrawlerBaidu()
        if res1 == 100:
            time.sleep(5)
            crawler_boss.CrawlerBoss()
        com_name1, baike_url = GetCompany(1)  # 1: fetch the company name and Baike URL
        if baike_url != "":
            crawler_baidu.CrawlerBaidu(com_name1, baike_url)
        else:
            Log(com_name1 + " - Baike URL is empty")
        com_name2, boss_url = GetCompany(2)  # 2: fetch the company name and BOSS URL
        if boss_url != "":
            crawler_boss.CrawlerBoss(com_name2, boss_url)
        else:
            Log(com_name2 + " - BOSS URL is empty")
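# Note: one uncaught Playwright or network exception inside the loop above kills
# the whole crawler process. A hedged hardening sketch; safe_crawl is an
# illustrative helper, not part of this commit:
def safe_crawl(step, *args):
    try:
        step(*args)
    except Exception as exc:  # deliberately broad: keep the long-running loop alive
        Log("crawl step failed - " + type(exc).__name__ + ": " + str(exc))

# usage inside CrawlerLaunch, e.g.:
# safe_crawl(crawler_baidu.CrawlerBaidu, com_name1, baike_url)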
def Log(dataS):
    with open("log/" + str(datetime.date.today()) + ".txt", "a+") as f:
@@ -63,7 +95,7 @@ def Log(dataS):
if __name__ == "__main__":
    p = multiprocessing.Process(target=CrawlerLaunch)
    p = multiprocessing.Process(target=CrawlerLaunchTypes)
    p.start()
    p.join()
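# Note: p.join() blocks until the child exits and nothing restarts it afterwards.
# A minimal supervisor sketch that would replace the block above (assumes a cold
# restart is acceptable):
# if __name__ == "__main__":
#     while True:
#         p = multiprocessing.Process(target=CrawlerLaunchTypes)
#         p.start()
#         p.join()
#         Log("crawler process exited with code " + str(p.exitcode) + ", restarting")
#         time.sleep(5)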
@@ -6,23 +6,25 @@ import cyaml
import crawler
# Crawler entry point
def CrawlerBaidu():
def CrawlerBaidu(com_name, baike_url):
    with sync_playwright() as playwright:
        return GetBaiduCompany(playwright)
        return GetBaiduCompany(playwright, com_name, baike_url)
# Scrape data from Baidu Baike
def GetBaiduCompany(playwright: Playwright) -> int:
def GetBaiduCompany(playwright: Playwright, com_name, baike_url) -> int:
    browser = playwright.chromium.launch(headless=cyaml.data[cyaml.data["env"]]["headless"])
    context = browser.new_context(viewport={"width": 800, "height": 600})
    page = context.new_page()
    name = com_name
    url = baike_url
    # fetch the company whose data needs to be scraped
    name, url = crawler.GetCompany(1)
    crawler.Log(name + " - Baidu: starting to request data: " + url)
    if url != "":
        crawler.Log("--------------------------baidu baike start--------------------------------")
        crawler.Log("Requesting the Baike page of " + name + ", URL: " + url)
        page.goto(url)
        all = page.locator(".lemma-summary").all_text_contents()
        intro = re.sub(r'\[[\d-]+\]', '', str(all))
        crawler.Log(name + " - Baidu data fetched: " + intro)
        crawler.Log("Data scraped from the Baike page of " + name + ": " + intro)
        if name != "" and intro != '[]':
            intro_new = str(intro[2:len(intro) - 2])
            new_string1 = re.sub(r'\\n', "", intro_new)
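# Note on the cleanup chain above: it operates on the *repr* of the locator text
# list. str(all) renders a real newline as the two characters "\n", which is why
# the patterns are r'\\n' / r'\\xa0' (literal backslash sequences) rather than
# r'\n'. A worked example under that assumption:
# >>> texts = ["简介 [1] 正文\n\xa0结尾"]
# >>> intro = re.sub(r'\[[\d-]+\]', '', str(texts))  # strips citation markers like [1] or [2-5]
# >>> intro_new = str(intro[2:len(intro) - 2])       # drop the surrounding "['" and "']"
# >>> re.sub(r'\\xa0', "", re.sub(r'\\n', "", intro_new))
# '简介  正文结尾'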
@@ -30,14 +32,14 @@ def GetBaiduCompany(playwright: Playwright) -> int:
            name2 = crawler.SaveCompanyData(name, 1, new_string2)
            if name2 != "":
                crawler.Log(name2 + ": Baidu data written successfully")
                crawler.Log(name2 + " - Baike data posted back to PHP: success")
                return 100
            else:
                crawler.Log(name + ": Baidu data write failed")
                crawler.Log(name + " - Baike data posted back to PHP: failed")
                return 100
        else:
            name2 = crawler.SaveCompanyData(name, 1, "")
            crawler.Log(name + ": Baidu data write failed")
            crawler.SaveCompanyData(name, 1, "")
            crawler.Log(name + " - no Baike data found; empty result posted back to PHP: success")
            return 100
    # ---------------------
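# Note: page.goto above uses Playwright's default 30s timeout and waits for the
# "load" event. For slow Baike pages a hedged hardening (the numeric timeouts
# are assumptions, not part of this commit):
# page.goto(url, wait_until="domcontentloaded", timeout=60000)
# page.wait_for_selector(".lemma-summary", timeout=15000)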
@@ -4,17 +4,16 @@ from playwright.sync_api import Playwright, sync_playwright
import json
import cyaml
def spider_company(page):
    company_nameO, url = crawler.GetCompany(2)
    crawler.Log(company_nameO + " - BOSS: starting to request data: " + url)
    if url != "":
def spider_company(page, com_name, boss_url):
    company_nameO = com_name
    url = boss_url
    crawler.Log("--------------------------boss start--------------------------------")
    crawler.Log("Requesting the BOSS Zhipin page of " + company_nameO + ", URL: " + url)
    page.goto(url)
    page.wait_for_timeout(3000)
    page.wait_for_selector(".input-wrap-text")
    company_detail_el = page.locator('div.company-card-wrapper a')
    company_detail_el_count = company_detail_el.count()
    crawler.Log("company_detail_el.count(): " + str(company_detail_el_count))
    my_obj = {'intro': ""}
    if company_detail_el_count > 0:
        company_detail_el.first.click()
@@ -30,19 +29,20 @@ def spider_company(page):
        if company_nameO in company_name:
            my_obj['intro'] = company_intro
            crawler.Log(company_name + " - BOSS data fetched: " + str(company_intro))
            crawler.Log("Data scraped from the BOSS Zhipin page of " + company_name + ": " + str(company_intro))
            name2 = crawler.SaveCompanyData(company_nameO, 2, str(company_intro))
            if name2 != "":
                crawler.Log(name2 + ": BOSS data written successfully")
                crawler.Log(name2 + " - BOSS data posted back to PHP: success")
            else:
                crawler.Log(company_nameO + ": BOSS data write failed")
                crawler.Log(company_nameO + " - BOSS data posted back to PHP: failed")
        else:
            crawler.SaveCompanyData(company_nameO, 2, "")
            name = crawler.SaveCompanyData(company_nameO, 2, "")
            if name != "":
                crawler.Log(company_nameO + " - no BOSS data found; empty result posted back to PHP: success")
            else:
                crawler.Log(company_nameO + " - no BOSS data found; empty result posted back to PHP: failed")
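# Note: the `company_nameO in company_name` check above is a plain substring
# match, so a stray space inside either name (like the one fixed in GetCompany)
# makes it fail silently. A hedged normalization helper; the name normalize_name
# is illustrative, not part of this commit:
def normalize_name(s):
    # collapse all internal whitespace before comparing company names
    return "".join(s.split())

# usage: if normalize_name(company_nameO) in normalize_name(company_name):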
def GetBossCompany(p: Playwright) -> None:
def GetBossCompany(p: Playwright, com_name, boss_url) -> None:
    browser = p.chromium.launch(headless=cyaml.data[cyaml.data["env"]]["headless"])
    context = browser.new_context(viewport={"width": 800, "height": 600})
    js = """
@@ -50,13 +50,13 @@ def GetBossCompany(p: Playwright) -> None:
    """
    page = context.new_page()
    page.add_init_script(js)
    spider_company(page)
    spider_company(page, com_name, boss_url)
    context.close()
    browser.close()
# Crawler entry point
def CrawlerBoss():
def CrawlerBoss(com_name, boss_url):
    with sync_playwright() as playwright:
        GetBossCompany(playwright)
        GetBossCompany(playwright, com_name, boss_url)
from playwright.sync_api import Playwright, sync_playwright, expect
import re
import json
import datetime
import crawler
# Crawler entry point
def CrawlerFutuNiuniu():
    with sync_playwright() as playwright:
        return GetFutuCompany(playwright)

def GetFutuCompany(playwright: Playwright) -> int:
    browser = playwright.chromium.launch(headless=False)
    context = browser.new_context(viewport={"width": 800, "height": 600})
    page = context.new_page()
    # fetch the company whose data needs to be scraped
    name, url = crawler.GetCompany(1)
    if url != "":
        page.goto(url)
        all = page.locator(".lemma-summary").all_text_contents()
        intro = re.sub(r'\[[\d-]+\]', '', str(all))
        crawler.Log(name + " - Futu data fetched: " + intro)
        if name != "" and intro != '[]':
            name2 = crawler.SaveCompanyData(name, 1, intro)
            if name2 != "":
                crawler.Log(name2 + ": Baidu data written successfully")
                return 100
            else:
                crawler.Log(name + ": Baidu data write failed")
                return 100
        else:
            crawler.Log(name + ": Baidu data write failed")
            return 100
    # ---------------------
    context.close()
    browser.close()
    return 100

CrawlerFutuNiuniu()
\ No newline at end of file
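# Note: the module-level CrawlerFutuNiuniu() call above runs at import time, so
# merely importing this file launches a headful browser. The standard guard
# (not in this commit) would be:
# if __name__ == "__main__":
#     CrawlerFutuNiuniu()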
from playwright.sync_api import Playwright, sync_playwright, expect
import re
import json
import cyaml
import crawler
# Crawler entry point
def CrawlerQqdoc(com_name, url):
    with sync_playwright() as playwright:
        return GetQqdocCompany(playwright, com_name, url)

# Scrape data from Tencent Docs (qqdoc)
def GetQqdocCompany(playwright: Playwright, com_name, url) -> int:
    browser = playwright.chromium.launch(headless=cyaml.data[cyaml.data["env"]]["headless"])
    context = browser.new_context(viewport={"width": 800, "height": 600})
    page = context.new_page()
    name = com_name
    # fetch the company whose data needs to be scraped
    crawler.Log("--------------------------qqdoc start--------------------------------")
    page.goto(url)
    page.wait_for_timeout(8000)
    page.wait_for_selector(".melo-page-container-view")
    # page.locator(".melo-page-container-view").all_text_contents()
    # page.frame_locator("iframe[name=\"login_frame\"]").frame_locator("iframe").get_by_role("link", name="Continue to use in browser").click()
    # select all and copy the document body ("Meta" is the macOS modifier; see the note at the end of the file)
    page.get_by_label("腾讯文档正文内容").press("Meta+a")
    page.get_by_label("腾讯文档正文内容").press("Meta+c")
    # read the copied text back out of the clipboard; the original `all = page.context`
    # grabbed the BrowserContext object itself, not the clipboard contents
    all = page.evaluate("navigator.clipboard.readText()")
    intro = re.sub(r'\[[\d-]+\]', '', str(all))
    crawler.Log(name + " - data scraped from qqdoc: " + intro)
    if name != "" and intro != '[]':
        intro_new = str(intro[2:len(intro) - 2])
        new_string1 = re.sub(r'\\n', "", intro_new)
        new_string2 = re.sub(r'\\xa0', "", new_string1)
        name2 = crawler.SaveCompanyData(name, 1, new_string2)
        if name2 != "":
            crawler.Log(name2 + " - qqdoc data posted back to PHP: success")
            return 100
        else:
            crawler.Log(name + " - qqdoc data posted back to PHP: failed")
            return 100
    else:
        crawler.SaveCompanyData(name, 1, "")
        crawler.Log(name + " - no qqdoc data found; empty result posted back to PHP: success")
        return 100
    # ---------------------
    context.close()
    browser.close()
    return 100

CrawlerQqdoc("ce", "https://doc.weixin.qq.com/doc/w3_AXEAcwZhACknEowzTBmRw6jzHlbf9?scode=AMwA6QetAAYQxyo3EtAcQADQaTAHI")
\ No newline at end of file
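# Note: the clipboard read in GetQqdocCompany only succeeds if the Chromium
# context is allowed clipboard access, and "Meta+c" is macOS-only. A hedged
# setup sketch; grant_permissions and the permission names are real
# Playwright/Chromium APIs, while the platform switch is an assumption about
# where this runs:
# import sys
# context = browser.new_context(viewport={"width": 800, "height": 600})
# context.grant_permissions(["clipboard-read", "clipboard-write"])
# copy_key = "Meta+c" if sys.platform == "darwin" else "Control+c"
# page.get_by_label("腾讯文档正文内容").press(copy_key)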