Crawler Demo 03

from bs4 import BeautifulSoup
from urllib.request import urlopen
import re


# Download the practice page and decode it as UTF-8 text. The context
# manager closes the HTTP response (and its socket) as soon as the body has
# been read, instead of leaving it open until garbage collection.
with urlopen("https://morvanzhou.github.io/static/scraping/table.html") as response:
    html = response.read().decode('utf-8')
print(html)
print("--------")
# <!DOCTYPE html>
# <html lang="cn">
# <head>
#   <meta charset="UTF-8">
#   <title>爬虫练习 表格 table | 莫烦 Python</title>
# 
#   <style>
#   img {
#       width: 250px;
#   }
#   table{
#       width:50%;
#   }
#   td{
#       margin:10px;
#       padding:15px;
#   }
#   </style>
# </head>
# <body>
# 
# <h1>表格 爬虫练习</h1>
# 
# <p>这是一个在 <a href="https://morvanzhou.github.io/" >莫烦 Python</a> 的 <a href="https://morvanzhou.github.io/tutorials/data-manipulation/scraping/" >爬虫教程</a>
#   里无敌简单的网页, 所有的 code 让你一目了然, 清晰无比.</p>
# 
# <br>
# <table id="course-list">
#   <tr>
#       <th>
#           分类
#       </th><th>
#           名字
#       </th><th>
#           时长
#       </th><th>
#           预览
#       </th>
#   </tr>
# 
#   <tr id="course1" class="ml">
#       <td>
#           机器学习
#       </td><td>
#           <a href="https://morvanzhou.github.io/tutorials/machine-learning/tensorflow/">
#               Tensorflow 神经网络</a>
#       </td><td>
#           2:00
#       </td><td>
#           <img src="https://morvanzhou.github.io/static/img/course_cover/tf.jpg">
#       </td>
#   </tr>
# 
#   <tr id="course2" class="ml">
#       <td>
#           机器学习
#       </td><td>
#           <a href="https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/">
#               强化学习</a>
#       </td><td>
#           5:00
#       </td><td>
#           <img src="https://morvanzhou.github.io/static/img/course_cover/rl.jpg">
#       </td>
#   </tr>
# 
#   <tr id="course3" class="data">
#       <td>
#           数据处理
#       </td><td>
#           <a href="https://morvanzhou.github.io/tutorials/data-manipulation/scraping/">
#               爬虫</a>
#       </td><td>
#           3:00
#       </td><td>
#           <img src="https://morvanzhou.github.io/static/img/course_cover/scraping.jpg">
#       </td>
#   </tr>
# 
# </table>
# 
# </body>
# </html>

# Parse the downloaded markup with the lxml parser.
soup = BeautifulSoup(html, features='lxml')
# Select every <img> tag whose src attribute matches the pattern.
# The pattern is a raw string so that `\.` reaches the regex engine as an
# escaped literal dot; in a normal string literal `\.` is an invalid escape
# sequence that newer Python versions warn about.
img_links = soup.find_all("img", {"src": re.compile(r'.*\.jpg')})
for link in img_links:
    print(link)         # the whole <img .../> tag
    print(link['src'])  # only the image URL

# <img src="https://morvanzhou.github.io/static/img/course_cover/tf.jpg"/>
# https://morvanzhou.github.io/static/img/course_cover/tf.jpg
# <img src="https://morvanzhou.github.io/static/img/course_cover/rl.jpg"/>
# https://morvanzhou.github.io/static/img/course_cover/rl.jpg
# <img src="https://morvanzhou.github.io/static/img/course_cover/scraping.jpg"/>
# https://morvanzhou.github.io/static/img/course_cover/scraping.jpg

print("--------")
# Pattern for hrefs that contain "https://morvan" followed by at least one
# more character (the trailing `.` is a regex wildcard, not a literal dot).
morvan_href = re.compile('https://morvan.')
# Keyword form of the attribute filter: keep <a> tags whose href matches.
courses_link = soup.find_all('a', href=morvan_href)
for anchor in courses_link:
    print(anchor['href'])
# https://morvanzhou.github.io/
# https://morvanzhou.github.io/tutorials/data-manipulation/scraping/
# https://morvanzhou.github.io/tutorials/machine-learning/tensorflow/
# https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/
# https://morvanzhou.github.io/tutorials/data-manipulation/scraping/
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容