Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
C
crawler-py
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
jimmy蒋政彪
crawler-py
Commits
adcf2d90
Commit
adcf2d90
authored
Aug 29, 2023
by
jimmy
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
提交
parent
7d4d8b4c
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
13 additions
and
16 deletions
+13
-16
2023-08-29.txt
2023-08-29.txt
+1
-0
crawler.py
crawler.py
+9
-6
crawler_baidu.py
crawler_baidu.py
+2
-9
crawler_boss.py
crawler_boss.py
+1
-1
No files found.
2023-08-29.txt
0 → 100644
View file @
adcf2d90
久久王食品国际有限公司:写入成功
crawler.py
View file @
adcf2d90
...
@@ -4,7 +4,7 @@ import time
...
@@ -4,7 +4,7 @@ import time
import
json
import
json
import
crawler_baidu
import
crawler_baidu
import
crawler_boss
import
crawler_boss
import
random
import
requests
import
requests
#获取需要爬数据的企业 百度1
#获取需要爬数据的企业 百度1
...
@@ -47,17 +47,20 @@ def SaveCompanyData(name,types,intro):
...
@@ -47,17 +47,20 @@ def SaveCompanyData(name,types,intro):
#启动爬虫
#启动爬虫
def
CrawlerLaunch
():
def
CrawlerLaunch
():
while
True
:
while
True
:
time
.
sleep
(
3
)
randomtime
=
random
.
randint
(
3
,
100
)
time
.
sleep
(
randomtime
)
now
=
datetime
.
datetime
.
now
()
now
=
datetime
.
datetime
.
now
()
print
(
str
(
now
)
+
":启动"
)
print
(
str
(
now
)
+
":启动"
)
#
crawler_baidu.CrawlerBaidu()
crawler_baidu
.
CrawlerBaidu
()
crawler_boss
.
CrawlerBoss
()
#
crawler_boss.CrawlerBoss()
# 在进程中执行的任务
# 在进程中执行的任务
def
Log
():
file
=
open
(
str
(
datetime
.
date
.
today
())
+
".txt"
,
"a"
)
def
Log
(
dataS
):
file
=
open
(
str
(
datetime
.
date
.
today
())
+
".txt"
,
"a+"
)
file
.
write
(
dataS
+
"
\n
"
)
file
.
close
()
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
p
=
multiprocessing
.
Process
(
target
=
CrawlerLaunch
)
p
=
multiprocessing
.
Process
(
target
=
CrawlerLaunch
)
...
...
crawler_baidu.py
View file @
adcf2d90
...
@@ -5,8 +5,6 @@ import json
...
@@ -5,8 +5,6 @@ import json
import
datetime
import
datetime
import
crawler
import
crawler
#爬虫启动
#爬虫启动
def
CrawlerBaidu
():
def
CrawlerBaidu
():
with
sync_playwright
()
as
playwright
:
with
sync_playwright
()
as
playwright
:
...
@@ -25,14 +23,9 @@ def GetBaiduCompany(playwright: Playwright) -> None:
...
@@ -25,14 +23,9 @@ def GetBaiduCompany(playwright: Playwright) -> None:
intro
=
re
.
sub
(
r'\[[\d-]+\]'
,
''
,
str
(
all
))
intro
=
re
.
sub
(
r'\[[\d-]+\]'
,
''
,
str
(
all
))
name2
=
crawler
.
SaveCompanyData
(
name
,
1
,
intro
)
name2
=
crawler
.
SaveCompanyData
(
name
,
1
,
intro
)
if
name2
!=
""
and
intro
!=
None
:
if
name2
!=
""
and
intro
!=
None
:
file
=
open
(
str
(
datetime
.
date
.
today
())
+
".txt"
,
"a"
)
crawler
.
Log
(
name2
+
":写入成功"
)
file
.
write
(
name2
+
":写入成功
\n
"
)
file
.
close
()
else
:
else
:
file
=
open
(
"example.txt"
,
"a"
)
crawler
.
Log
(
name2
+
":写入失败"
)
file
.
write
(
name2
+
":写入失败
\n
"
)
file
.
close
()
# ---------------------
# ---------------------
context
.
close
()
context
.
close
()
...
...
crawler_boss.py
View file @
adcf2d90
...
@@ -37,7 +37,7 @@ def spider_company(page):
...
@@ -37,7 +37,7 @@ def spider_company(page):
name2
=
crawler
.
SaveCompanyData
(
company_nameO
,
1
,
json
.
dumps
(
my_obj
))
name2
=
crawler
.
SaveCompanyData
(
company_nameO
,
1
,
json
.
dumps
(
my_obj
))
def
GetBossCompany
(
p
:
Playwright
)
->
None
:
def
GetBossCompany
(
p
:
Playwright
)
->
None
:
browser
=
p
.
chromium
.
launch
(
headless
=
Fals
e
)
browser
=
p
.
chromium
.
launch
(
headless
=
Tru
e
)
context
=
browser
.
new_context
()
context
=
browser
.
new_context
()
js
=
"""
js
=
"""
Object.defineProperties(navigator, {webdriver:{get:()=>undefined}});
Object.defineProperties(navigator, {webdriver:{get:()=>undefined}});
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment