Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
C
crawler-py
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
jimmy蒋政彪
crawler-py
Commits
b2cfca39
Commit
b2cfca39
authored
Aug 30, 2023
by
jimmy
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
提交
parent
cd864674
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
4 additions
and
6 deletions
+4
-6
crawler.cpython-311.pyc
__pycache__/crawler.cpython-311.pyc
+0
-0
crawler.py
crawler.py
+0
-2
crawler_baidu.py
crawler_baidu.py
+2
-2
crawler_boss.py
crawler_boss.py
+2
-2
No files found.
__pycache__/crawler.cpython-311.pyc
View file @
b2cfca39
No preview for this file type
crawler.py
View file @
b2cfca39
...
...
@@ -11,8 +11,6 @@ import cyaml
#获取需要爬数据的企业 百度1
def
GetCompany
(
types
):
response
=
requests
.
get
(
cyaml
.
data
[
cyaml
.
data
[
"env"
]][
"url"
]
+
cyaml
.
data
[
"php-api"
][
"getcompany"
]
+
"?type="
+
str
(
types
),
headers
=
{
"Content-Type"
:
"application/json"
})
print
(
response
)
print
(
cyaml
.
data
[
cyaml
.
data
[
"env"
]][
"url"
]
+
cyaml
.
data
[
"php-api"
][
"getcompany"
]
+
"?type="
+
str
(
types
))
if
response
.
status_code
==
200
:
response_data
=
response
.
json
()
# if response_data.get('name') is not None:
...
...
crawler_baidu.py
View file @
b2cfca39
...
...
@@ -2,7 +2,7 @@ from playwright.sync_api import Playwright, sync_playwright, expect
import
re
import
json
import
datetime
import
cyaml
import
crawler
#爬虫启动
...
...
@@ -12,7 +12,7 @@ def CrawlerBaidu():
#爬取百度的数据
def
GetBaiduCompany
(
playwright
:
Playwright
)
->
int
:
browser
=
playwright
.
chromium
.
launch
(
headless
=
False
)
browser
=
playwright
.
chromium
.
launch
(
headless
=
cyaml
.
data
[
cyaml
.
data
[
"env"
]][
"headless"
]
)
context
=
browser
.
new_context
(
viewport
=
{
"width"
:
800
,
"height"
:
600
})
page
=
context
.
new_page
()
#获取需要爬取数据的公司
...
...
crawler_boss.py
View file @
b2cfca39
...
...
@@ -2,7 +2,7 @@ import crawler
import
requests
from
playwright.sync_api
import
Playwright
,
sync_playwright
import
json
import
cyaml
def
spider_company
(
page
):
company_nameO
,
url
=
crawler
.
GetCompany
(
2
)
...
...
@@ -37,7 +37,7 @@ def spider_company(page):
def
GetBossCompany
(
p
:
Playwright
)
->
None
:
browser
=
p
.
chromium
.
launch
(
headless
=
False
)
browser
=
p
.
chromium
.
launch
(
headless
=
cyaml
.
data
[
cyaml
.
data
[
"env"
]][
"headless"
]
)
context
=
browser
.
new_context
(
viewport
=
{
"width"
:
800
,
"height"
:
600
})
js
=
"""
Object.defineProperties(navigator, {webdriver:{get:()=>undefined}});
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment