Commit 7d4d8b4c authored by jimmy's avatar jimmy

代码入仓库

parent 69c426c8
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.11" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/pw-py.iml" filepath="$PROJECT_DIR$/.idea/pw-py.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/venv" />
</content>
<orderEntry type="jdk" jdkName="Python 3.11" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
\ No newline at end of file
import datetime
import multiprocessing
import time
import json
import crawler_baidu
import crawler_boss
import requests
# Fetch the next company that needs crawling from the dispatch service.
def GetCompany(types):
    """Ask the dispatch service for the next company to crawl.

    Parameters:
        types: crawl-source identifier (1 = Baidu, 2 = Boss).

    Returns:
        (company_name, url) tuple; both empty strings on any failure
        (network error, non-200 status, malformed/missing payload).
    """
    try:
        response = requests.get(
            "http://192.168.25.19/smart-writing/company/spider-company",
            params={"type": types},  # original hand-built the query string
            headers={"Content-Type": "application/json"},
            timeout=10,  # original could hang forever on a dead service
        )
    except requests.RequestException:
        return "", ""
    if response.status_code == 200:
        try:
            data = response.json()["data"]
            return data["company_name"], data["url"]
        except (ValueError, KeyError, TypeError):
            # Invalid JSON or unexpected schema: treat as "nothing to do".
            return "", ""
    else:
        return "", ""
# Report the data collected by the crawler back to the dispatch service.
def SaveCompanyData(name, types, intro):
    """Post a crawled company intro back to the dispatch service.

    Parameters:
        name: company name exactly as issued by GetCompany.
        types: crawl-source identifier (1 = Baidu, 2 = Boss).
        intro: scraped introduction text.

    Returns:
        name when the service accepts the data (HTTP 200 and body
        code == 200), "" on any failure.
    """
    post_data = {
        "name": name,
        "content": json.dumps({"intro": intro}),
        "type": types,
    }
    # Serialize once; the same string is logged and sent.
    json_data = json.dumps(post_data)
    print("json_data:", json_data)
    try:
        response = requests.post(
            "http://192.168.25.19/smart-writing/company/spider-company",
            data=json_data,
            headers={"Content-Type": "application/json"},
            timeout=10,  # original could hang forever on a dead service
        )
    except requests.RequestException:
        return ""
    if response.status_code != 200:
        return ""
    try:
        response_data = response.json()
    except ValueError:
        return ""
    # .get() avoids the KeyError the original raised on a missing "code".
    return name if response_data.get("code") == 200 else ""
# Crawler driver loop.
def CrawlerLaunch():
    """Poll loop: wake every 3 seconds, log the tick, run the Boss crawler once."""
    while True:
        time.sleep(3)
        started_at = datetime.datetime.now()
        print(str(started_at) + ":启动")
        # Baidu crawler is currently disabled:
        # crawler_baidu.CrawlerBaidu()
        crawler_boss.CrawlerBoss()
def Log(message=""):
    """Append *message* (plus a newline) to today's log file "YYYY-MM-DD.txt".

    Backward compatible: calling Log() with no argument still just
    creates/touches the file, as the original did.

    Parameters:
        message: text to append; nothing is written when empty.
    """
    # Context manager closes the handle; the original leaked an open file.
    with open(str(datetime.date.today()) + ".txt", "a") as file:
        if message:
            file.write(message + "\n")
# Entry point: run the crawler loop in a dedicated child process and
# block until it exits (it normally never does).
if __name__ == "__main__":
    worker = multiprocessing.Process(target=CrawlerLaunch)
    worker.start()
    worker.join()
from playwright.sync_api import Playwright, sync_playwright, expect
import re
import json
import datetime
import crawler
# Crawler entry point.
def CrawlerBaidu():
    """Open a Playwright session and scrape one pending Baidu Baike entry."""
    with sync_playwright() as pw:
        GetBaiduCompany(pw)
# Scrape company data from Baidu Baike.
def GetBaiduCompany(playwright: Playwright) -> None:
    """Scrape the Baidu Baike summary for the next pending type-1 company
    and report the result back to the dispatch service.

    Parameters:
        playwright: an active sync Playwright instance.
    """
    browser = playwright.chromium.launch(headless=True)
    context = browser.new_context(viewport={"width": 800, "height": 600})
    try:
        page = context.new_page()
        # Ask the dispatch service which company to crawl next.
        name, url = crawler.GetCompany(1)
        if url != "":
            page.goto(url)
            # `.lemma-summary` holds the Baike intro paragraphs; strip the
            # inline citation markers such as [1] / [1-3].
            # (renamed from `all`, which shadowed the builtin)
            summaries = page.locator(".lemma-summary").all_text_contents()
            intro = re.sub(r'\[[\d-]+\]', '', str(summaries))
            saved_name = crawler.SaveCompanyData(name, 1, intro)
            # `intro` is always a str from re.sub, so the check is mostly
            # defensive; use `is not None` instead of `!= None`.
            if saved_name != "" and intro is not None:
                # Success log goes to a per-day file.
                with open(str(datetime.date.today()) + ".txt", "a") as file:
                    file.write(saved_name + ":写入成功\n")
            else:
                # NOTE(review): failures go to a different, fixed file —
                # presumably intentional, but verify.
                with open("example.txt", "a") as file:
                    file.write(saved_name + ":写入失败\n")
    finally:
        # Always release the browser, even if the crawl raises
        # (the original leaked both on any exception).
        context.close()
        browser.close()
import crawler
import requests
from playwright.sync_api import Playwright, sync_playwright
import json
def spider_company(page):
    """Scrape one company's intro from a Boss Zhipin page and push the
    result back to the dispatch service.

    Parameters:
        page: a Playwright page with the anti-webdriver init script installed.
    """
    company_name_wanted, url = crawler.GetCompany(2)
    page.goto(url)
    page.wait_for_timeout(3000)
    page.wait_for_selector(".input-wrap-text")
    company_cards = page.locator('div.company-card-wrapper a')
    card_count = company_cards.count()
    print("company_detail_el.count():", card_count)
    # Result payload; intro stays "" when nothing usable is found.
    my_obj = {'intro': ""}
    if card_count > 0:
        # Open the first matching company card.
        company_cards.first.click()
        page.wait_for_timeout(1000)
        page.wait_for_selector("div.info h1.name")
        company_name = page.locator("div.info h1.name").first.inner_text()
        company_intro_el = page.locator("div.job-sec > div.fold-text")
        if company_intro_el.count() > 0:
            company_intro = company_intro_el.first.inner_text()
            # Accept the intro when either name contains the other
            # (original spelled this as two separate ifs).
            if (company_name in company_name_wanted
                    or company_name_wanted in company_name):
                my_obj['intro'] = company_intro
            print('company_name', company_name)
            print('company_intro', company_intro)
    # BUG FIX: the original passed type=1 here although this crawler
    # handles type-2 (Boss) companies — see GetCompany(2) above and the
    # dead post_data dict it built (removed) which said "type": 2.
    # NOTE(review): saving even with an empty intro appears intended so
    # the service marks the company processed — confirm.
    crawler.SaveCompanyData(company_name_wanted, 2, json.dumps(my_obj))
def GetBossCompany(p: Playwright) -> None:
    """Launch a visible Chromium instance with the `navigator.webdriver`
    automation flag masked, then run the Boss company spider in it."""
    browser = p.chromium.launch(headless=False)
    context = browser.new_context()
    # Injected before any page script runs, so anti-bot checks that read
    # navigator.webdriver see `undefined`.
    js = """
    Object.defineProperties(navigator, {webdriver:{get:()=>undefined}});
    """
    page = context.new_page()
    page.add_init_script(js)
    # Earlier class-based experiment:
    # spider = BossSpider()
    # spider.spider_company(page)
    spider_company(page)
    context.close()
    browser.close()
# Crawler entry point.
def CrawlerBoss():
    """Open a Playwright session and run one Boss Zhipin crawl pass."""
    with sync_playwright() as pw:
        GetBossCompany(pw)
LET Group Holdings Limited:写入成功
中国城市基础设施集团有限公司:写入成功
资本策略地产有限公司:写入成功
大昌微线集团有限公司:写入成功
非凡中国控股有限公司:写入成功
ASMPT Limited:写入成功
利时集团(控股)有限公司:写入成功
欣融国际控股有限公司:写入成功
宏安集团有限公司:写入成功
实力建业集团有限公司:写入成功
新秀丽国际有限公司:写入成功
中国金融租赁集团有限公司:写入成功
棠记(控股)有限公司:写入成功
亿和精密工业控股有限公司:写入成功
绿景(中国)地产投资有限公司:写入成功
天泓文创国际集团有限公司:写入成功
长虹佳华控股有限公司:写入成功
温岭浙江工量刃具交易中心股份有限公司:写入成功
环科国际集团有限公司:写入成功
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment