Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
C
crawler-py
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
jimmy蒋政彪
crawler-py
Commits
b7d0d80b
Commit
b7d0d80b
authored
Aug 29, 2023
by
jimmy
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
提交两个一起的爬虫
parent
72e607c7
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
44 additions
and
15 deletions
+44
-15
2023-08-29.txt
2023-08-29.txt
+19
-0
crawler.cpython-311.pyc
__pycache__/crawler.cpython-311.pyc
+0
-0
crawler_baidu.cpython-311.pyc
__pycache__/crawler_baidu.cpython-311.pyc
+0
-0
crawler_boss.cpython-311.pyc
__pycache__/crawler_boss.cpython-311.pyc
+0
-0
crawler.py
crawler.py
+4
-6
crawler_baidu.py
crawler_baidu.py
+13
-6
crawler_boss.py
crawler_boss.py
+8
-3
No files found.
2023-08-29.txt
View file @
b7d0d80b
久久王食品国际有限公司:写入成功
久久王食品国际有限公司:写入成功
阳光油砂有限公司:写入成功
双桦控股有限公司:写入成功
联亚集团有限公司:写入成功
理文造纸有限公司:写入成功
昆仑国际金融集团有限公司:写入成功
银涛控股有限公司-获取到百度数据:[]
银涛控股有限公司:百度数据,写入成功
香港信贷集团有限公司-获取到百度数据:[]
香港信贷集团有限公司:百度数据,写入成功
东方大学城控股(香港)有限公司-获取到百度数据:[]
东方大学城控股(香港)有限公司:百度数据,写入成功
高鹏矿业控股有限公司-获取到百度数据:[]
高鹏矿业控股有限公司:百度数据,写入失败
志高控股有限公司-获取到百度数据:[]
志高控股有限公司:百度数据,写入失败
名创优品集团控股有限公司-获取到百度数据:['\n名创优品集团控股有限公司于2020年1月7日在开曼群岛注册成立,是一家主要从事生活家居产品以及潮流玩具产品的零售及批发业务的中国控股公司。\n\xa0\n\n\xa0\n自2013年在中国开设第一家门店以来的九年时间里,公司已成功孵化了两个品牌,分别是名创优品和TOP TOY。\n\xa0\n2022年8月,名创优品因将穿旗袍的娃娃称为日本艺伎而备受舆论争议。据媒体报道,名创优品公司在与希腊、罗马尼亚、保加利亚等国外合作方签约时,现场悬挂的为日本国旗,且创始人叶国富等高管均参与合影。\n\xa0\n\n']
名创优品集团控股有限公司:百度数据,写入成功
上海大众公用事业(集团)股份有限公司-获取到百度数据:['\n公司是上海乃至华东地区最重要的燃气供应商之一,控股50%的上海大众燃气是国内首家产权多元化的大型城市燃气企业。大众燃气拥有上海浦西地区苏州河以南8个行政区的燃气客户134万户,占据着上海燃气销售市场40%的市场份额。投资4亿元收购上海燃气市南销售有限公司50%的股权,同时也配合西气东输工程,对燃气管网进行改造和建设。公司拥有燃气地下管网总长达5380公里,燃气用户达175万户。公司参股银行、保险、券商等众多金融类公司,同时公司投资2.75亿元参股深圳市创新投资集团有限公司,该公司是一家来自深圳的高科技企业孵化器,旗下大量项目有望在创业板推出后实现上市。\n']
上海大众公用事业(集团)股份有限公司:百度数据,写入成功
__pycache__/crawler.cpython-311.pyc
View file @
b7d0d80b
No preview for this file type
__pycache__/crawler_baidu.cpython-311.pyc
View file @
b7d0d80b
No preview for this file type
__pycache__/crawler_boss.cpython-311.pyc
View file @
b7d0d80b
No preview for this file type
crawler.py
View file @
b7d0d80b
...
@@ -27,9 +27,6 @@ def SaveCompanyData(name,types,intro):
...
@@ -27,9 +27,6 @@ def SaveCompanyData(name,types,intro):
"type"
:
types
"type"
:
types
}
}
json_data
=
json
.
dumps
(
post_data
)
json_data
=
json
.
dumps
(
post_data
)
# 将 JSON 数据转换为字符串
print
(
"json_data:"
,
json_data
)
# 发送 POST 请求并传递 JSON 数据
response
=
requests
.
post
(
"http://192.168.25.19/smart-writing/company/spider-company"
,
data
=
json_data
,
headers
=
{
"Content-Type"
:
"application/json"
})
response
=
requests
.
post
(
"http://192.168.25.19/smart-writing/company/spider-company"
,
data
=
json_data
,
headers
=
{
"Content-Type"
:
"application/json"
})
# 检查响应状态码
# 检查响应状态码
...
@@ -49,11 +46,12 @@ def CrawlerLaunch():
...
@@ -49,11 +46,12 @@ def CrawlerLaunch():
while
True
:
while
True
:
now
=
datetime
.
datetime
.
now
()
now
=
datetime
.
datetime
.
now
()
randomtime
=
random
.
randint
(
3
,
120
)
randomtime
=
random
.
randint
(
3
,
120
)
randomtime
=
5
print
(
str
(
now
)
+
":启动等待时间"
+
str
(
randomtime
))
print
(
str
(
now
)
+
":启动等待时间"
+
str
(
randomtime
))
time
.
sleep
(
randomtime
)
time
.
sleep
(
randomtime
)
crawler_baidu
.
CrawlerBaidu
()
res1
=
crawler_baidu
.
CrawlerBaidu
()
#crawler_boss.CrawlerBoss()
if
res1
==
100
:
# 在进程中执行的任务
crawler_boss
.
CrawlerBoss
()
def
Log
(
dataS
):
def
Log
(
dataS
):
...
...
crawler_baidu.py
View file @
b7d0d80b
...
@@ -11,7 +11,7 @@ def CrawlerBaidu():
...
@@ -11,7 +11,7 @@ def CrawlerBaidu():
GetBaiduCompany
(
playwright
)
GetBaiduCompany
(
playwright
)
#爬取百度的数据
#爬取百度的数据
def
GetBaiduCompany
(
playwright
:
Playwright
)
->
None
:
def
GetBaiduCompany
(
playwright
:
Playwright
)
->
int
:
browser
=
playwright
.
chromium
.
launch
(
headless
=
False
)
browser
=
playwright
.
chromium
.
launch
(
headless
=
False
)
context
=
browser
.
new_context
(
viewport
=
{
"width"
:
800
,
"height"
:
600
})
context
=
browser
.
new_context
(
viewport
=
{
"width"
:
800
,
"height"
:
600
})
page
=
context
.
new_page
()
page
=
context
.
new_page
()
...
@@ -21,13 +21,20 @@ def GetBaiduCompany(playwright: Playwright) -> None:
...
@@ -21,13 +21,20 @@ def GetBaiduCompany(playwright: Playwright) -> None:
page
.
goto
(
url
)
page
.
goto
(
url
)
all
=
page
.
locator
(
".lemma-summary"
)
.
all_text_contents
()
all
=
page
.
locator
(
".lemma-summary"
)
.
all_text_contents
()
intro
=
re
.
sub
(
r'\[[\d-]+\]'
,
''
,
str
(
all
))
intro
=
re
.
sub
(
r'\[[\d-]+\]'
,
''
,
str
(
all
))
name2
=
crawler
.
SaveCompanyData
(
name
,
1
,
intro
)
crawler
.
Log
(
name
+
"-获取到百度数据:"
+
intro
)
if
name2
!=
""
and
intro
!=
None
:
if
name
!=
""
and
intro
!=
'[]'
:
crawler
.
Log
(
name2
+
":写入成功"
)
name2
=
crawler
.
SaveCompanyData
(
name
,
1
,
intro
)
if
name2
!=
""
:
crawler
.
Log
(
name2
+
":百度数据,写入成功"
)
return
100
else
:
else
:
crawler
.
Log
(
name2
+
":写入失败"
)
crawler
.
Log
(
name
+
":百度数据,写入失败"
)
return
100
else
:
crawler
.
Log
(
name
+
":百度数据,写入失败"
)
return
100
# ---------------------
# ---------------------
context
.
close
()
context
.
close
()
browser
.
close
()
browser
.
close
()
return
100
crawler_boss.py
View file @
b7d0d80b
...
@@ -11,7 +11,8 @@ def spider_company(page):
...
@@ -11,7 +11,8 @@ def spider_company(page):
page
.
wait_for_selector
(
".input-wrap-text"
)
page
.
wait_for_selector
(
".input-wrap-text"
)
company_detail_el
=
page
.
locator
(
'div.company-card-wrapper a'
)
company_detail_el
=
page
.
locator
(
'div.company-card-wrapper a'
)
company_detail_el_count
=
company_detail_el
.
count
()
company_detail_el_count
=
company_detail_el
.
count
()
print
(
"company_detail_el.count():"
,
company_detail_el_count
)
crawler
.
Log
(
"company_detail_el.count():"
+
company_detail_el_count
)
my_obj
=
{
'intro'
:
""
}
my_obj
=
{
'intro'
:
""
}
if
company_detail_el_count
>
0
:
if
company_detail_el_count
>
0
:
company_detail_el
.
first
.
click
()
company_detail_el
.
first
.
click
()
...
@@ -26,10 +27,14 @@ def spider_company(page):
...
@@ -26,10 +27,14 @@ def spider_company(page):
if
company_nameO
in
company_name
:
if
company_nameO
in
company_name
:
my_obj
[
'intro'
]
=
company_intro
my_obj
[
'intro'
]
=
company_intro
crawler
.
Log
(
company_name
+
"-获取到boss数据:"
+
company_intro
)
print
(
'company_name'
,
company_name
)
print
(
'company_intro'
,
company_intro
)
name2
=
crawler
.
SaveCompanyData
(
company_nameO
,
1
,
json
.
dumps
(
my_obj
))
name2
=
crawler
.
SaveCompanyData
(
company_nameO
,
1
,
json
.
dumps
(
my_obj
))
if
name2
!=
""
:
crawler
.
Log
(
name2
+
":boss数据,写入成功"
)
else
:
crawler
.
Log
(
company_nameO
+
":boss数据,写入失败"
)
def
GetBossCompany
(
p
:
Playwright
)
->
None
:
def
GetBossCompany
(
p
:
Playwright
)
->
None
:
browser
=
p
.
chromium
.
launch
(
headless
=
True
)
browser
=
p
.
chromium
.
launch
(
headless
=
True
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment