Commit 7d4d8b4c authored by jimmy's avatar jimmy

代码入仓库

parent 69c426c8
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.11" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/pw-py.iml" filepath="$PROJECT_DIR$/.idea/pw-py.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/venv" />
</content>
<orderEntry type="jdk" jdkName="Python 3.11" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
\ No newline at end of file
import datetime
import multiprocessing
import time
import json
import crawler_baidu
import crawler_boss
import requests
# Fetch the next company that needs crawling from the dispatch service.
def GetCompany(types):
    """Ask the dispatch service for the next company to crawl.

    Parameters:
        types: crawl-source identifier (1 = Baidu, 2 = Boss).

    Returns:
        (company_name, url) tuple; both empty strings on any failure
        (network error, non-200 status, malformed/missing payload).
    """
    try:
        response = requests.get(
            "http://192.168.25.19/smart-writing/company/spider-company",
            params={"type": types},  # original hand-built the query string
            headers={"Content-Type": "application/json"},
            timeout=10,  # original could hang forever on a dead service
        )
    except requests.RequestException:
        return "", ""
    if response.status_code == 200:
        try:
            data = response.json()["data"]
            return data["company_name"], data["url"]
        except (ValueError, KeyError, TypeError):
            # Invalid JSON or unexpected schema: treat as "nothing to do".
            return "", ""
    else:
        return "", ""
# Report the data collected by the crawler back to the dispatch service.
def SaveCompanyData(name, types, intro):
    """Post a crawled company intro back to the dispatch service.

    Parameters:
        name: company name exactly as issued by GetCompany.
        types: crawl-source identifier (1 = Baidu, 2 = Boss).
        intro: scraped introduction text.

    Returns:
        name when the service accepts the data (HTTP 200 and body
        code == 200), "" on any failure.
    """
    post_data = {
        "name": name,
        "content": json.dumps({"intro": intro}),
        "type": types,
    }
    # Serialize once; the same string is logged and sent.
    json_data = json.dumps(post_data)
    print("json_data:", json_data)
    try:
        response = requests.post(
            "http://192.168.25.19/smart-writing/company/spider-company",
            data=json_data,
            headers={"Content-Type": "application/json"},
            timeout=10,  # original could hang forever on a dead service
        )
    except requests.RequestException:
        return ""
    if response.status_code != 200:
        return ""
    try:
        response_data = response.json()
    except ValueError:
        return ""
    # .get() avoids the KeyError the original raised on a missing "code".
    return name if response_data.get("code") == 200 else ""
# Crawler driver loop.
def CrawlerLaunch():
    """Poll loop: wake every 3 seconds, log the tick, run the Boss crawler once."""
    while True:
        time.sleep(3)
        started_at = datetime.datetime.now()
        print(str(started_at) + ":启动")
        # Baidu crawler is currently disabled:
        # crawler_baidu.CrawlerBaidu()
        crawler_boss.CrawlerBoss()
def Log(message=""):
    """Append *message* (plus a newline) to today's log file "YYYY-MM-DD.txt".

    Backward compatible: calling Log() with no argument still just
    creates/touches the file, as the original did.

    Parameters:
        message: text to append; nothing is written when empty.
    """
    # Context manager closes the handle; the original leaked an open file.
    with open(str(datetime.date.today()) + ".txt", "a") as file:
        if message:
            file.write(message + "\n")
# Entry point: run the crawler loop in a dedicated child process and
# block until it exits (it normally never does).
if __name__ == "__main__":
    worker = multiprocessing.Process(target=CrawlerLaunch)
    worker.start()
    worker.join()
from playwright.sync_api import Playwright, sync_playwright, expect
import re
import json
import datetime
import crawler
# Crawler entry point.
def CrawlerBaidu():
    """Open a Playwright session and scrape one pending Baidu Baike entry."""
    with sync_playwright() as pw:
        GetBaiduCompany(pw)
# Scrape company data from Baidu Baike.
def GetBaiduCompany(playwright: Playwright) -> None:
    """Scrape the Baidu Baike summary for the next pending type-1 company
    and report the result back to the dispatch service.

    Parameters:
        playwright: an active sync Playwright instance.
    """
    browser = playwright.chromium.launch(headless=True)
    context = browser.new_context(viewport={"width": 800, "height": 600})
    try:
        page = context.new_page()
        # Ask the dispatch service which company to crawl next.
        name, url = crawler.GetCompany(1)
        if url != "":
            page.goto(url)
            # `.lemma-summary` holds the Baike intro paragraphs; strip the
            # inline citation markers such as [1] / [1-3].
            # (renamed from `all`, which shadowed the builtin)
            summaries = page.locator(".lemma-summary").all_text_contents()
            intro = re.sub(r'\[[\d-]+\]', '', str(summaries))
            saved_name = crawler.SaveCompanyData(name, 1, intro)
            # `intro` is always a str from re.sub, so the check is mostly
            # defensive; use `is not None` instead of `!= None`.
            if saved_name != "" and intro is not None:
                # Success log goes to a per-day file.
                with open(str(datetime.date.today()) + ".txt", "a") as file:
                    file.write(saved_name + ":写入成功\n")
            else:
                # NOTE(review): failures go to a different, fixed file —
                # presumably intentional, but verify.
                with open("example.txt", "a") as file:
                    file.write(saved_name + ":写入失败\n")
    finally:
        # Always release the browser, even if the crawl raises
        # (the original leaked both on any exception).
        context.close()
        browser.close()
import crawler
import requests
from playwright.sync_api import Playwright, sync_playwright
import json
def spider_company(page):
    """Scrape one company's intro from a Boss Zhipin page and push the
    result back to the dispatch service.

    Parameters:
        page: a Playwright page with the anti-webdriver init script installed.
    """
    company_name_wanted, url = crawler.GetCompany(2)
    page.goto(url)
    page.wait_for_timeout(3000)
    page.wait_for_selector(".input-wrap-text")
    company_cards = page.locator('div.company-card-wrapper a')
    card_count = company_cards.count()
    print("company_detail_el.count():", card_count)
    # Result payload; intro stays "" when nothing usable is found.
    my_obj = {'intro': ""}
    if card_count > 0:
        # Open the first matching company card.
        company_cards.first.click()
        page.wait_for_timeout(1000)
        page.wait_for_selector("div.info h1.name")
        company_name = page.locator("div.info h1.name").first.inner_text()
        company_intro_el = page.locator("div.job-sec > div.fold-text")
        if company_intro_el.count() > 0:
            company_intro = company_intro_el.first.inner_text()
            # Accept the intro when either name contains the other
            # (original spelled this as two separate ifs).
            if (company_name in company_name_wanted
                    or company_name_wanted in company_name):
                my_obj['intro'] = company_intro
            print('company_name', company_name)
            print('company_intro', company_intro)
    # BUG FIX: the original passed type=1 here although this crawler
    # handles type-2 (Boss) companies — see GetCompany(2) above and the
    # dead post_data dict it built (removed) which said "type": 2.
    # NOTE(review): saving even with an empty intro appears intended so
    # the service marks the company processed — confirm.
    crawler.SaveCompanyData(company_name_wanted, 2, json.dumps(my_obj))
def GetBossCompany(p: Playwright) -> None:
    """Launch a visible Chromium instance with the `navigator.webdriver`
    automation flag masked, then run the Boss company spider in it."""
    browser = p.chromium.launch(headless=False)
    context = browser.new_context()
    # Injected before any page script runs, so anti-bot checks that read
    # navigator.webdriver see `undefined`.
    js = """
    Object.defineProperties(navigator, {webdriver:{get:()=>undefined}});
    """
    page = context.new_page()
    page.add_init_script(js)
    # Earlier class-based experiment:
    # spider = BossSpider()
    # spider.spider_company(page)
    spider_company(page)
    context.close()
    browser.close()
# Crawler entry point.
def CrawlerBoss():
    """Open a Playwright session and run one Boss Zhipin crawl pass."""
    with sync_playwright() as pw:
        GetBossCompany(pw)
LET Group Holdings Limited:写入成功
中国城市基础设施集团有限公司:写入成功
资本策略地产有限公司:写入成功
大昌微线集团有限公司:写入成功
非凡中国控股有限公司:写入成功
ASMPT Limited:写入成功
利时集团(控股)有限公司:写入成功
欣融国际控股有限公司:写入成功
宏安集团有限公司:写入成功
实力建业集团有限公司:写入成功
新秀丽国际有限公司:写入成功
中国金融租赁集团有限公司:写入成功
棠记(控股)有限公司:写入成功
亿和精密工业控股有限公司:写入成功
绿景(中国)地产投资有限公司:写入成功
天泓文创国际集团有限公司:写入成功
长虹佳华控股有限公司:写入成功
温岭浙江工量刃具交易中心股份有限公司:写入成功
环科国际集团有限公司:写入成功
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment