爬虫代码——爬取简书首页前10文章
"""Fetch the Jianshu home page and print each listed article's title, link and summary."""
from urllib.parse import urljoin
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup

HOME_URL = 'https://www.jianshu.com/'
# The request carries a desktop Firefox User-Agent instead of urllib's default one.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',
}
# Upper bound on how long the download may block, so the script cannot hang forever.
TIMEOUT_SECONDS = 10


def fetch_home_page():
    """Download the home page and return the raw response body as bytes.

    Raises:
        urllib.error.URLError: if the request fails or times out.
    """
    request = Request(url=HOME_URL, headers=HEADERS)
    # The context manager closes the HTTP response even if read() raises.
    with urlopen(request, timeout=TIMEOUT_SECONDS) as response:
        return response.read()


def extract_articles(html):
    """Parse *html* and return a list of ``(title, link, summary)`` tuples.

    An article is any ``<a class="title" target="_blank">`` element. Its
    summary is the text of the first ``<p>`` found under the anchor's parent;
    if there is no such ``<p>``, the summary is an empty string.
    """
    soup = BeautifulSoup(html, 'html.parser')
    articles = []
    for anchor in soup.find_all('a', {'class': 'title', 'target': '_blank'}):
        # urljoin resolves root-relative hrefs against the site and leaves
        # absolute hrefs intact; a missing href falls back to the home URL.
        link = urljoin(HOME_URL, anchor.attrs.get('href', ''))
        summary_tag = anchor.parent.find('p')
        # The summary text is padded with whitespace in the page markup;
        # strip it rather than cutting a fixed number of characters.
        summary = summary_tag.get_text().strip() if summary_tag is not None else ''
        articles.append((anchor.get_text(), link, summary))
    return articles


def print_articles(articles):
    """Print every ``(title, link, summary)`` tuple in the original report layout."""
    for title, link, summary in articles:
        print('=====文章=====')
        print('标题:{}'.format(title))
        print('链接: {}'.format(link))
        print(summary)
        print('============\n')


def main():
    """Fetch the home page, extract the articles and print them."""
    print_articles(extract_articles(fetch_home_page()))


if __name__ == '__main__':
    main()