Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
C
crawler-py
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
jimmy蒋政彪
crawler-py
Commits
72e607c7
Commit
72e607c7
authored
Aug 29, 2023
by
jimmy
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
提交任意时间
parent
adcf2d90
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
15 additions
and
16 deletions
+15
-16
vcs.xml
.idea/vcs.xml
+6
-0
crawler.cpython-311.pyc
__pycache__/crawler.cpython-311.pyc
+0
-0
crawler_baidu.cpython-311.pyc
__pycache__/crawler_baidu.cpython-311.pyc
+0
-0
crawler_boss.cpython-311.pyc
__pycache__/crawler_boss.cpython-311.pyc
+0
-0
crawler.py
crawler.py
+7
-7
crawler_baidu.py
crawler_baidu.py
+1
-1
crawler_boss.py
crawler_boss.py
+1
-8
No files found.
.idea/vcs.xml
0 → 100644
View file @
72e607c7
<?xml version="1.0" encoding="UTF-8"?>
<project
version=
"4"
>
<component
name=
"VcsDirectoryMappings"
>
<mapping
directory=
""
vcs=
"Git"
/>
</component>
</project>
\ No newline at end of file
__pycache__/crawler.cpython-311.pyc
View file @
72e607c7
No preview for this file type
__pycache__/crawler_baidu.cpython-311.pyc
View file @
72e607c7
No preview for this file type
__pycache__/crawler_boss.cpython-311.pyc
View file @
72e607c7
No preview for this file type
crawler.py
View file @
72e607c7
...
...
@@ -47,20 +47,20 @@ def SaveCompanyData(name,types,intro):
# Start the crawler
def CrawlerLaunch():
    """Run the Baidu crawler in an endless loop.

    Before every pass a random delay of 3-120 seconds is drawn, announced
    on stdout together with the current timestamp, and then slept, so that
    requests are not issued at a fixed cadence.

    This function never returns; it is meant to be used as the target of a
    separate process.
    """
    while True:
        now = datetime.datetime.now()
        # Draw the delay once per pass and log it *before* sleeping so the
        # operator can see how long the crawler will stay idle.
        randomtime = random.randint(3, 120)
        print(str(now) + ":启动等待时间" + str(randomtime))
        time.sleep(randomtime)
        crawler_baidu.CrawlerBaidu()
        # The Boss crawler is currently disabled:
        # crawler_boss.CrawlerBoss()
# 在进程中执行的任务
def Log(dataS):
    """Append one line of text to today's log file.

    The log file lives in the current working directory and is named after
    the current date, e.g. ``2023-08-29.txt``. It is created on first use.

    Args:
        dataS: The text to record; a trailing newline is added.
    """
    log_path = str(datetime.date.today()) + ".txt"
    # The context manager closes the file even if write() raises, so no
    # explicit close() is needed. UTF-8 is set explicitly so that Chinese
    # text is written the same way regardless of the platform's locale.
    with open(log_path, "a", encoding="utf-8") as f:
        f.write(dataS + "\n")
if
__name__
==
"__main__"
:
p
=
multiprocessing
.
Process
(
target
=
CrawlerLaunch
)
...
...
crawler_baidu.py
View file @
72e607c7
...
...
@@ -12,7 +12,7 @@ def CrawlerBaidu():
#爬取百度的数据
def
GetBaiduCompany
(
playwright
:
Playwright
)
->
None
:
browser
=
playwright
.
chromium
.
launch
(
headless
=
Tru
e
)
browser
=
playwright
.
chromium
.
launch
(
headless
=
Fals
e
)
context
=
browser
.
new_context
(
viewport
=
{
"width"
:
800
,
"height"
:
600
})
page
=
context
.
new_page
()
#获取需要爬取数据的公司
...
...
crawler_boss.py
View file @
72e607c7
...
...
@@ -13,11 +13,6 @@ def spider_company(page):
company_detail_el_count
=
company_detail_el
.
count
()
print
(
"company_detail_el.count():"
,
company_detail_el_count
)
my_obj
=
{
'intro'
:
""
}
post_data
=
{
"name"
:
company_nameO
,
"content"
:
json
.
dumps
(
my_obj
),
"type"
:
2
}
if
company_detail_el_count
>
0
:
company_detail_el
.
first
.
click
()
page
.
wait_for_timeout
(
1000
)
...
...
@@ -38,14 +33,12 @@ def spider_company(page):
def
GetBossCompany
(
p
:
Playwright
)
->
None
:
browser
=
p
.
chromium
.
launch
(
headless
=
True
)
context
=
browser
.
new_context
()
context
=
browser
.
new_context
(
viewport
=
{
"width"
:
800
,
"height"
:
600
}
)
js
=
"""
Object.defineProperties(navigator, {webdriver:{get:()=>undefined}});
"""
page
=
context
.
new_page
()
page
.
add_init_script
(
js
)
# spider = BossSpider()
# spider.spider_company(page)
spider_company
(
page
)
context
.
close
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment