Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
C
crawler-py
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
jimmy蒋政彪
crawler-py
Commits
85e1821f
Commit
85e1821f
authored
Aug 30, 2023
by
jimmy
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
提交
parent
1538ad5f
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
32 additions
and
28 deletions
+32
-28
crawler.cpython-311.pyc
__pycache__/crawler.cpython-311.pyc
+0
-0
crawler_boss.cpython-311.pyc
__pycache__/crawler_boss.cpython-311.pyc
+0
-0
crawler.py
crawler.py
+2
-1
crawler_boss.py
crawler_boss.py
+30
-27
No files found.
__pycache__/crawler.cpython-311.pyc
View file @
85e1821f
No preview for this file type
__pycache__/crawler_boss.cpython-311.pyc
View file @
85e1821f
No preview for this file type
crawler.py
View file @
85e1821f
...
@@ -14,7 +14,8 @@ def GetCompany(types):
...
@@ -14,7 +14,8 @@ def GetCompany(types):
if
response
.
status_code
==
200
:
if
response
.
status_code
==
200
:
response_data
=
response
.
json
()
response_data
=
response
.
json
()
# if response_data.get('name') is not None:
# if response_data.get('name') is not None:
return
"上海临方股权投资管理有限公司"
,
"https://www.zhipin.com/web/geek/job?query=
%
E5
%
A4
%
A7
%
E6
%97%8
F
%
E6
%
BF
%80%
E5
%85%89
&city=100010000"
#return "上海临方股权投资管理有限公司","https://www.zhipin.com/web/geek/job?query=%E5%A4%A7%E6%97%8F%E6%BF%80%E5%85%89&city=100010000"
#return "上海临方股权投资管理有限公司","https://baike.baidu.com/item/%E4%B8%8A%E6%B5%B7%E4%B8%B4%E6%96%B9%E8%82%A1%E6%9D%83%E6%8A%95%E8%B5%84%E7%AE%A1%E7%90%86%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8?fromModule=lemma_search-box"
return
response_data
[
"data"
][
"company_name"
],
response_data
[
"data"
][
"url"
]
return
response_data
[
"data"
][
"company_name"
],
response_data
[
"data"
][
"url"
]
return
""
,
""
return
""
,
""
...
...
crawler_boss.py
View file @
85e1821f
...
@@ -6,34 +6,37 @@ import cyaml
...
@@ -6,34 +6,37 @@ import cyaml
def
spider_company
(
page
):
def
spider_company
(
page
):
company_nameO
,
url
=
crawler
.
GetCompany
(
2
)
company_nameO
,
url
=
crawler
.
GetCompany
(
2
)
page
.
goto
(
url
)
if
url
!=
""
:
page
.
wait_for_timeout
(
3000
)
page
.
goto
(
url
)
page
.
wait_for_selector
(
".input-wrap-text"
)
page
.
wait_for_timeout
(
3000
)
company_detail_el
=
page
.
locator
(
'div.company-card-wrapper a'
)
page
.
wait_for_selector
(
".input-wrap-text"
)
company_detail_el_count
=
company_detail_el
.
count
()
company_detail_el
=
page
.
locator
(
'div.company-card-wrapper a'
)
crawler
.
Log
(
"company_detail_el.count():"
+
str
(
company_detail_el_count
))
company_detail_el_count
=
company_detail_el
.
count
()
crawler
.
Log
(
"company_detail_el.count():"
+
str
(
company_detail_el_count
))
my_obj
=
{
'intro'
:
""
}
if
company_detail_el_count
>
0
:
my_obj
=
{
'intro'
:
""
}
company_detail_el
.
first
.
click
()
if
company_detail_el_count
>
0
:
page
.
wait_for_timeout
(
1000
)
company_detail_el
.
first
.
click
()
page
.
wait_for_selector
(
"div.info h1.name"
)
page
.
wait_for_timeout
(
1000
)
company_name
=
page
.
locator
(
"div.info h1.name"
)
.
first
.
inner_text
()
page
.
wait_for_selector
(
"div.info h1.name"
)
company_intro_el
=
page
.
locator
(
"div.job-sec > div.fold-text"
)
company_name
=
page
.
locator
(
"div.info h1.name"
)
.
first
.
inner_text
()
if
company_intro_el
.
count
()
>
0
:
company_intro_el
=
page
.
locator
(
"div.job-sec > div.fold-text"
)
company_intro
=
company_intro_el
.
first
.
inner_text
()
if
company_intro_el
.
count
()
>
0
:
if
company_name
in
company_nameO
:
company_intro
=
company_intro_el
.
first
.
inner_text
()
my_obj
[
'intro'
]
=
company_intro
if
company_name
in
company_nameO
:
my_obj
[
'intro'
]
=
company_intro
if
company_nameO
in
company_name
:
my_obj
[
'intro'
]
=
company_intro
if
company_nameO
in
company_name
:
my_obj
[
'intro'
]
=
company_intro
crawler
.
Log
(
company_name
+
"-获取到boss数据:"
+
str
(
company_intro
))
name2
=
crawler
.
SaveCompanyData
(
company_nameO
,
2
,
str
(
company_intro
))
crawler
.
Log
(
company_name
+
"-获取到boss数据:"
+
str
(
company_intro
))
if
name2
!=
""
:
name2
=
crawler
.
SaveCompanyData
(
company_nameO
,
2
,
str
(
company_intro
))
crawler
.
Log
(
name2
+
":boss数据,写入成功"
)
if
name2
!=
""
:
crawler
.
Log
(
name2
+
":boss数据,写入成功"
)
else
:
crawler
.
Log
(
company_nameO
+
":boss数据,写入失败"
)
else
:
else
:
crawler
.
Log
(
company_nameO
+
":boss数据,写入失败
"
)
crawler
.
SaveCompanyData
(
company_nameO
,
2
,
"
"
)
def
GetBossCompany
(
p
:
Playwright
)
->
None
:
def
GetBossCompany
(
p
:
Playwright
)
->
None
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment