from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
# Download the practice page and decode the body as UTF-8 text.
# The response is opened in a `with` block so the underlying connection is
# closed as soon as the body has been read, instead of being left open.
with urlopen("https://morvanzhou.github.io/static/scraping/table.html") as response:
    html = response.read().decode('utf-8')
print(html)
print("--------")
# <!DOCTYPE html>
# <html lang="cn">
# <head>
# <meta charset="UTF-8">
# <title>爬虫练习 表格 table | 莫烦 Python</title>
#
# <style>
# img {
# width: 250px;
# }
# table{
# width:50%;
# }
# td{
# margin:10px;
# padding:15px;
# }
# </style>
# </head>
# <body>
#
# <h1>表格 爬虫练习</h1>
#
# <p>这是一个在 <a href="https://morvanzhou.github.io/" >莫烦 Python</a> 的 <a href="https://morvanzhou.github.io/tutorials/data-manipulation/scraping/" >爬虫教程</a>
# 里无敌简单的网页, 所有的 code 让你一目了然, 清晰无比.</p>
#
# <br>
# <table id="course-list">
# <tr>
# <th>
# 分类
# </th><th>
# 名字
# </th><th>
# 时长
# </th><th>
# 预览
# </th>
# </tr>
#
# <tr id="course1" class="ml">
# <td>
# 机器学习
# </td><td>
# <a href="https://morvanzhou.github.io/tutorials/machine-learning/tensorflow/">
# Tensorflow 神经网络</a>
# </td><td>
# 2:00
# </td><td>
# <img src="https://morvanzhou.github.io/static/img/course_cover/tf.jpg">
# </td>
# </tr>
#
# <tr id="course2" class="ml">
# <td>
# 机器学习
# </td><td>
# <a href="https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/">
# 强化学习</a>
# </td><td>
# 5:00
# </td><td>
# <img src="https://morvanzhou.github.io/static/img/course_cover/rl.jpg">
# </td>
# </tr>
#
# <tr id="course3" class="data">
# <td>
# 数据处理
# </td><td>
# <a href="https://morvanzhou.github.io/tutorials/data-manipulation/scraping/">
# 爬虫</a>
# </td><td>
# 3:00
# </td><td>
# <img src="https://morvanzhou.github.io/static/img/course_cover/scraping.jpg">
# </td>
# </tr>
#
# </table>
#
# </body>
# </html>
# Parse the downloaded page using the lxml parser backend.
soup = BeautifulSoup(html, features='lxml')
# Select every <img> tag whose `src` attribute matches the pattern.
# A raw string keeps `\.` as a regex escape for a literal dot rather than an
# invalid string escape sequence; the compiled pattern itself is unchanged.
img_links = soup.find_all("img", {"src": re.compile(r'.*\.jpg')})
for link in img_links:
    print(link)         # the whole matched tag
    print(link['src'])  # only the value of its src attribute
# <img src="https://morvanzhou.github.io/static/img/course_cover/tf.jpg"/>
# https://morvanzhou.github.io/static/img/course_cover/tf.jpg
# <img src="https://morvanzhou.github.io/static/img/course_cover/rl.jpg"/>
# https://morvanzhou.github.io/static/img/course_cover/rl.jpg
# <img src="https://morvanzhou.github.io/static/img/course_cover/scraping.jpg"/>
# https://morvanzhou.github.io/static/img/course_cover/scraping.jpg
print("--------")
# Select every <a> tag whose `href` attribute matches the pattern below.
# Note that the trailing `.` is a regex wildcard, so it matches any single
# character following "https://morvan".
courses_link = soup.find_all('a', {'href': re.compile('https://morvan.')})
for link in courses_link:
    print(link['href'])  # only the link target
# https://morvanzhou.github.io/
# https://morvanzhou.github.io/tutorials/data-manipulation/scraping/
# https://morvanzhou.github.io/tutorials/machine-learning/tensorflow/
# https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/
# https://morvanzhou.github.io/tutorials/data-manipulation/scraping/
# Crawler Demo 03
# ©著作权归作者所有,转载或内容合作请联系作者
# 平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。