jimmy蒋政彪 / crawler-py / Commits / 90a51741

Commit 90a51741, authored Oct 08, 2023 by jimmy
Commit message: Submit data (提交数据)
Parent: 0c9219f3

Changes: 8 changed files, with 162 additions and 114 deletions (+162 -114)
__pycache__/crawler.cpython-311.pyc   +0 -0
__pycache__/crawler_baidu.cpython-311.pyc   +0 -0
__pycache__/crawler_boss.cpython-311.pyc   +0 -0
crawler.py   +40 -8
crawler_baidu.py   +28 -26
crawler_boss.py   +39 -39
crawler_futuniuniu.py   +0 -41 (deleted)
crawler_qqdoc.py   +55 -0 (new)
__pycache__/crawler.cpython-311.pyc @ 90a51741: no preview for this file type
__pycache__/crawler_baidu.cpython-311.pyc @ 90a51741: no preview for this file type
__pycache__/crawler_boss.cpython-311.pyc @ 90a51741: no preview for this file type
crawler.py @ 90a51741

@@ -13,9 +13,10 @@ def GetCompany(types):
     response = requests.get(cyaml.data[cyaml.data["env"]]["url"] + cyaml.data["php-api"]["getcompany"] + "?type=" + str(types), headers={"Content-Type": "application/json"})
     if response.status_code == 200:
         response_data = response.json()
+        # if response_data.get('name') is not None:
+        if types == 88:  # BOSS Zhipin test
+            #return "上海临方股权投资管理有限公司","https://www.zhipin.com/web/geek/job?query=%E5%A4%A7%E6%97%8F%E6%BF%80%E5%85%89&city=100010000"
+            return "上海临方股权投资管理有限公司", "https://www.zhipin.com/web/geek/job?query=%E5%A4%A7%E6%97%8F%E6%BF%80%E5%85%89&city=100010000"
+            #return "上海临方股权投资管理有限公司","https://baike.baidu.com/item/%E4%B8%8A%E6%B5%B7%E4%B8%B4%E6%96%B9%E8%82%A1%E6%9D%83%E6%8A%95%E8%B5%84%E7%AE%A1%E7%90%86%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8?fromModule=lemma_search-box"
+        elif types == 99:  # Baidu Baike test
+            return "上海临方股权投资管理有限公司", "https://baike.baidu.com/item/%E4%B8%8A%E6%B5%B7%E4%B8%B4%E6%96%B9%E8%82%A1%E6%9D%83%E6%8A%95%E8%B5%84%E7%AE%A1%E7%90%86%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8?fromModule=lemma_search-box"
         return response_data["data"]["company_name"], response_data["data"]["url"]
     return "", ""
@@ -45,16 +46,47 @@ def SaveCompanyData(name,types,intro):
 # Start the crawler
+def CrawlerLaunchTypes():
+    types = 88
+    match types:
+        case 99:
+            print("百度百科 测试")
+            com_name1, baike_url = GetCompany(99)  # 1: get the company name and the Baike link
+            if baike_url != "":
+                crawler_baidu.CrawlerBaidu(com_name1, baike_url)
+            else:
+                Log(com_name1 + "-百度 url为空:")
+        case 88:
+            print("boss 测试")
+            while(True):
+                time.sleep(2)
+                com_name2, boss_url = GetCompany(88)  # 2: get the company name and the BOSS link
+                if boss_url != "":
+                    crawler_boss.CrawlerBoss(com_name2, boss_url)
+                else:
+                    Log(com_name2 + "-boss url为空:")
+        case _:
+            CrawlerLaunch()
 def CrawlerLaunch():
     while True:
         now = datetime.datetime.now()
         randomtime = random.randint(3, cyaml.data[cyaml.data["env"]]["randomtime"])
+        Log(str(now) + ":启动等待时间" + str(randomtime))
         print(str(now) + ":启动等待时间" + str(randomtime))
         time.sleep(randomtime)
-        res1 = crawler_baidu.CrawlerBaidu()
-        if res1 == 100:
-            time.sleep(5)
-            crawler_boss.CrawlerBoss()
+        com_name1, baike_url = GetCompany(1)  # 1: get the company name and the Baike link
+        if baike_url != "":
+            crawler_baidu.CrawlerBaidu(com_name1, baike_url)
+        else:
+            Log(com_name1 + "-百度 url为空:")
+        com_name2, boss_url = GetCompany(2)  # 2: get the company name and the BOSS link
+        if boss_url != "":
+            crawler_boss.CrawlerBoss(com_name2, boss_url)
+        else:
+            Log(com_name2 + "-boss url为空:")
 def Log(dataS):
     with open("log/" + str(datetime.date.today()) + ".txt", "a+") as f:

@@ -63,7 +95,7 @@ def Log(dataS):
 if __name__ == "__main__":
-    p = multiprocessing.Process(target=CrawlerLaunch)
+    p = multiprocessing.Process(target=CrawlerLaunchTypes)
     p.start()
     p.join()
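The launcher change swaps the child-process target from CrawlerLaunch to the new CrawlerLaunchTypes, which hard-codes types = 88 and dispatches with match/case (Python 3.10+; the committed .pyc files are CPython 3.11, which fits). A self-contained sketch of the same dispatch-and-bootstrap pattern, with stand-in crawler functions in place of the real ones:

    import multiprocessing

    def run_baidu_test():
        print("baidu baike test path")

    def run_boss_test():
        print("boss test path")

    def production_loop():
        print("production loop")

    def launch(types: int = 88) -> None:
        match types:            # structural pattern matching, Python 3.10+
            case 99:
                run_baidu_test()
            case 88:
                run_boss_test()
            case _:
                production_loop()

    if __name__ == "__main__":
        # Same bootstrap as the diff: run the launcher in a child process.
        p = multiprocessing.Process(target=launch)
        p.start()
        p.join()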
crawler_baidu.py @ 90a51741

@@ -6,23 +6,25 @@ import cyaml
 import crawler
 # Start the crawler
-def CrawlerBaidu():
+def CrawlerBaidu(com_name, baike_url):
     with sync_playwright() as playwright:
-        return GetBaiduCompany(playwright)
+        return GetBaiduCompany(playwright, com_name, baike_url)
 # Crawl data from Baidu
-def GetBaiduCompany(playwright: Playwright) -> int:
+def GetBaiduCompany(playwright: Playwright, com_name, baike_url) -> int:
     browser = playwright.chromium.launch(headless=cyaml.data[cyaml.data["env"]]["headless"])
     context = browser.new_context(viewport={"width": 800, "height": 600})
     page = context.new_page()
+    name = com_name
+    url = baike_url
     # Get the company whose data needs to be crawled
-    name, url = crawler.GetCompany(1)
+    crawler.Log("--------------------------baidu baike start--------------------------------")
-    crawler.Log(name + "-百度开始请求数据:" + url)
+    crawler.Log("开始请求 " + name + " 公司的百科地址,地址为:" + url)
     if url != "":
         page.goto(url)
         all = page.locator(".lemma-summary").all_text_contents()
         intro = re.sub(r'\[[\d-]+\]', '', str(all))
-        crawler.Log(name + "-获取到百度数据:" + intro)
+        crawler.Log(name + " 公司的百科地址上抓取到的数据:" + intro)
         if name != "" and intro != '[]':
             intro_new = str(intro[2:len(intro) - 2])
             new_string1 = re.sub(r'\\n', "", intro_new)

@@ -30,14 +32,14 @@ def GetBaiduCompany(playwright: Playwright) -> int:
             name2 = crawler.SaveCompanyData(name, 1, new_string2)
             if name2 != "":
-                crawler.Log(name2 + ":百度数据,写入成功")
+                crawler.Log(name2 + " 公司百科数据回传到php 回传成功")
                 return 100
             else:
-                crawler.Log(name + ":百度数据,写入失败")
+                crawler.Log(name + "公司百科数据回传到php 回传失败")
                 return 100
         else:
-            name2 = crawler.SaveCompanyData(name, 1, "")
+            crawler.SaveCompanyData(name, 1, "")
-            crawler.Log(name + ":百度数据,写入失败")
+            crawler.Log(name + " 公司没有获取到百科数据回传到php 回传成功")
             return 100
 # ---------------------
 ...
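The cleanup chain in this file is easy to misread, so here is a standalone sketch of what it does to a scraped summary; the sample text is invented. str() of the locator's text list produces a ['...'] repr wrapper, which the [2:len-2] slice strips, and the later re.sub calls remove the literal \n and \xa0 escape sequences that the repr produced:

    import re

    # What .all_text_contents() might return: real newlines and \xa0 (NBSP)
    # inside the scraped summary. The sample text is invented.
    all_texts = ["某公司简介\n成立于2001年 [1]\xa0主营业务 [2-3]"]

    intro = re.sub(r'\[[\d-]+\]', '', str(all_texts))   # drop [n] / [n-m] citation markers
    if intro != '[]':                                    # str([]) == '[]' means nothing was scraped
        intro_new = str(intro[2:len(intro) - 2])         # strip the ['...'] repr wrapper
        new_string1 = re.sub(r'\\n', "", intro_new)      # remove the \n escapes the repr produced
        new_string2 = re.sub(r'\\xa0', "", new_string1)  # remove the \xa0 escapes likewise
        print(new_string2)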
crawler_boss.py @ 90a51741

@@ -4,17 +4,16 @@ from playwright.sync_api import Playwright, sync_playwright
 import json
 import cyaml
-def spider_company(page):
+def spider_company(page, com_name, boss_url):
-    company_nameO, url = crawler.GetCompany(2)
+    company_nameO = com_name
-    crawler.Log(company_nameO + "-boss开始请求数据:" + url)
+    url = boss_url
     if url != "":
+        crawler.Log("--------------------------boss start--------------------------------")
+        crawler.Log("开始请求 " + company_nameO + " 公司的boss地址,地址为:" + url)
         page.goto(url)
         page.wait_for_timeout(3000)
         page.wait_for_selector(".input-wrap-text")
         company_detail_el = page.locator('div.company-card-wrapper a')
         company_detail_el_count = company_detail_el.count()
+        crawler.Log("company_detail_el.count():" + str(company_detail_el_count))
         my_obj = {'intro': ""}
         if company_detail_el_count > 0:
             company_detail_el.first.click()

@@ -30,19 +29,20 @@ def spider_company(page):
             if company_nameO in company_name:
                 my_obj['intro'] = company_intro
-                crawler.Log(company_name + "-获取到boss数据:" + str(company_intro))
+                crawler.Log(company_name + " 公司的百科地址上抓取到的数据:" + str(company_intro))
                 name2 = crawler.SaveCompanyData(company_nameO, 2, str(company_intro))
                 if name2 != "":
-                    crawler.Log(name2 + ":boss数据,写入成功")
+                    crawler.Log(name2 + " 公司boss数据回传到php 回传成功")
                 else:
-                    crawler.Log(company_nameO + ":boss数据,写入失败")
+                    crawler.Log(company_nameO + " 公司boss数据回传到php 回传失败")
             else:
-                crawler.SaveCompanyData(company_nameO, 2, "")
+                name = crawler.SaveCompanyData(company_nameO, 2, "")
+                if name != "":
+                    crawler.Log(company_nameO + " 公司没有获取到boss数据回传到php 回传成功")
         else:
             crawler.SaveCompanyData(company_nameO, 2, "")
+            crawler.Log(company_nameO + " 公司没有获取到boss数据回传到php 回传失败")
-def GetBossCompany(p: Playwright) -> None:
+def GetBossCompany(p: Playwright, com_name, boss_url) -> None:
     browser = p.chromium.launch(headless=cyaml.data[cyaml.data["env"]]["headless"])
     context = browser.new_context(viewport={"width": 800, "height": 600})
     js = """

@@ -50,13 +50,13 @@ def GetBossCompany(p: Playwright) -> None:
     """
     page = context.new_page()
     page.add_init_script(js)
-    spider_company(page)
+    spider_company(page, com_name, boss_url)
     context.close()
     browser.close()
 # Start the crawler
-def CrawlerBoss():
+def CrawlerBoss(com_name, boss_url):
     with sync_playwright() as playwright:
-        GetBossCompany(playwright)
+        GetBossCompany(playwright, com_name, boss_url)
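spider_company guards its click behind company_detail_el.count(), and the new Log line records that count before deciding. A reduced sketch of the same guard pattern; the URL and selector passed in the usage line are placeholders, not the real BOSS Zhipin ones:

    from playwright.sync_api import sync_playwright

    def first_card_text(url: str, selector: str) -> str:
        # Open a page, count elements matching the selector, and only click
        # when at least one match exists; otherwise return an empty string,
        # mirroring the empty my_obj['intro'] fallback in the diff.
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            page = browser.new_context(viewport={"width": 800, "height": 600}).new_page()
            page.goto(url)
            cards = page.locator(selector)
            text = ""
            if cards.count() > 0:
                cards.first.click()
                text = cards.first.inner_text()
            browser.close()
            return text

    # Hypothetical usage; example.com has no such cards, so this returns "".
    print(first_card_text("https://example.com", "div.company-card-wrapper a"))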
crawler_futuniuniu.py (deleted, 100644 → 0) @ 0c9219f3

-from playwright.sync_api import Playwright, sync_playwright, expect
-import re
-import json
-import datetime
-import crawler
-# Start the crawler
-def CrawlerFutuNiuniu():
-    with sync_playwright() as playwright:
-        return GetFutuCompany(playwright)
-def GetFutuCompany(playwright: Playwright) -> int:
-    browser = playwright.chromium.launch(headless=False)
-    context = browser.new_context(viewport={"width": 800, "height": 600})
-    page = context.new_page()
-    # Get the company whose data needs to be crawled
-    name, url = crawler.GetCompany(1)
-    if url != "":
-        page.goto(url)
-        all = page.locator(".lemma-summary").all_text_contents()
-        intro = re.sub(r'\[[\d-]+\]', '', str(all))
-        crawler.Log(name + "-获取到futu数据:" + intro)
-        if name != "" and intro != '[]':
-            name2 = crawler.SaveCompanyData(name, 1, intro)
-            if name2 != "":
-                crawler.Log(name2 + ":百度数据,写入成功")
-                return 100
-            else:
-                crawler.Log(name + ":百度数据,写入失败")
-                return 100
-        else:
-            crawler.Log(name + ":百度数据,写入失败")
-            return 100
-    # ---------------------
-    context.close()
-    browser.close()
-    return 100
-CrawlerFutuNiuniu()
\ No newline at end of file
crawler_qqdoc.py (new, 0 → 100644) @ 90a51741

+from playwright.sync_api import Playwright, sync_playwright, expect
+import re
+import json
+import cyaml
+import crawler
+# Start the crawler
+def CrawlerQqdoc(com_name, url):
+    with sync_playwright() as playwright:
+        return GetQqdocCompany(playwright, com_name, url)
+# Crawl data from Baidu (comment copied over from crawler_baidu.py)
+def GetQqdocCompany(playwright: Playwright, com_name, url) -> int:
+    browser = playwright.chromium.launch(headless=cyaml.data[cyaml.data["env"]]["headless"])
+    context = browser.new_context(viewport={"width": 800, "height": 600})
+    page = context.new_page()
+    name = com_name
+    # Get the company whose data needs to be crawled
+    crawler.Log("--------------------------qqdoc start--------------------------------")
+    page.goto(url)
+    page.wait_for_timeout(8000)
+    page.wait_for_selector(".melo-page-container-view")
+    # page.locator(".melo-page-container-view").all_text_contents()
+    # page.frame_locator("iframe[name=\"login_frame\"]").frame_locator("iframe").get_by_role("link", name="Continue to use in browser").click()
+    page.get_by_label("腾讯文档正文内容").press("Meta+a")
+    page.get_by_label("腾讯文档正文内容").press("Meta+c")
+    all = page.context
+    intro = re.sub(r'\[[\d-]+\]', '', str(all))
+    crawler.Log(name + " qqdoc上抓取到的数据:" + intro)
+    if name != "" and intro != '[]':
+        intro_new = str(intro[2:len(intro) - 2])
+        new_string1 = re.sub(r'\\n', "", intro_new)
+        new_string2 = re.sub(r'\\xa0', "", new_string1)
+        name2 = crawler.SaveCompanyData(name, 1, new_string2)
+        if name2 != "":
+            crawler.Log(name2 + " qqdoc数据回传到php 回传成功")
+            return 100
+        else:
+            crawler.Log(name + "qqdoc数据回传到php 回传失败")
+            return 100
+    else:
+        crawler.SaveCompanyData(name, 1, "")
+        crawler.Log(name + " 公司没有获取到qqdoc数据回传到php 回传成功")
+        return 100
+    # ---------------------
+    context.close()
+    browser.close()
+    return 100
+CrawlerQqdoc("ce", "https://doc.weixin.qq.com/doc/w3_AXEAcwZhACknEowzTBmRw6jzHlbf9?scode=AMwA6QetAAYQxyo3EtAcQADQaTAHI")
\ No newline at end of file
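One caveat in the new file: all = page.context stores the Playwright BrowserContext object, so str(all) logs an object repr rather than the text that the Meta+a / Meta+c key presses selected and copied. A hedged sketch of one way to read the copied text back, assuming a Chromium context with clipboard permissions granted (clipboard access in headless mode can still be flaky):

    from playwright.sync_api import sync_playwright

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        context = browser.new_context()
        # Chromium contexts can pre-grant clipboard permissions.
        context.grant_permissions(["clipboard-read", "clipboard-write"])
        page = context.new_page()
        page.goto("https://example.com")
        page.keyboard.press("Meta+A")  # Meta works on macOS; use Control+A elsewhere
        page.keyboard.press("Meta+C")
        # navigator.clipboard.readText() returns a Promise; evaluate awaits it.
        copied = page.evaluate("() => navigator.clipboard.readText()")
        print(copied)
        browser.close()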