Python操作

0. Python特性

0.1 Python中的包、模块以及import时注意事项
包：包含init文件的文件夹
模块：包文件夹下的.py文件
import：
引入包中模块：import packname.modulename
引入多个模块中多个变量：from modulename import (a, b, …)

0.2 Python37特性转自https://www.cnblogs.com/Li-JT/p/15376165.html
Python 3 中（自 3.0 起即支持，见 PEP 3107，并非 3.7 才新增），函数的参数可以通过冒号来添加注解（annotation）

def f(ham: str, eggs: str = 'eggs') -> str:
    """Join ``ham`` and ``eggs`` with ' and ' after printing some diagnostics.

    The function first prints its own annotation mapping and then the two
    arguments it received. The ``str`` annotations are informational only;
    nothing in this function checks them.
    """
    annotation_map = f.__annotations__
    print("Annotations:", annotation_map)
    print("Arguments:", ham, eggs)
    joined = ham + ' and ' + eggs
    return joined

代码中的 str 都只是类型注解（annotation），不是强制确定的类型，运行时不会做类型检查（Python是动态类型的）
冒号后表示参数的注释，如果需要默认赋值再在后面加等号即可。
箭头后表示返回值的注释。

1. 画图表

利用matplotlib库
1）折线图

import matplotlib.pyplot as plt

# Draw a line chart.
# plt.plot(x, y, format_string): in 'b-.o', 'b' is blue, '-.' is a dash-dot
# line and 'o' is a filled-circle marker.
y_values = list(range(20))
x_values = [y - 10 for y in y_values]
plt.plot(x_values, y_values, 'b-.o', label='blue')

# Title and axis labels.
plt.title('Demo')
plt.xlabel('x')
plt.ylabel('y')
# Show the legend box that maps the line to its label.
plt.legend()
# savefig() has to be called before show() to produce a result.
# The file extension picks the format (png, jpg, svg, ...).
plt.savefig('demo.png')
plt.show()

demo.png

2）并列柱状图

import numpy as np
import matplotlib.pyplot as plt

size = 5
# Three series of `size` random heights, drawn in the order a, b, c.
a, b, c = (np.random.random(size) for _ in range(3))

total_width = 0.8  # combined width of the three bars in one group
n = 3  # number of bars per group
width = total_width / n  # width of a single bar
# x coordinate of the centre of the first bar in each group
x = np.arange(size) - (total_width - width) / 2

# plt.bar(x, height, width): each series is shifted right by one more bar width.
for position, (heights, name) in enumerate(((a, 'a'), (b, 'b'), (c, 'c'))):
    plt.bar(x + position * width, heights, width=width, label=name)
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()

image.png

2. time库当前时间打印

import time

print(time.time()) # current time as a float timestamp
print(time.localtime()) # current local time as a struct_time (time tuple)

tuple_time = time.localtime(time.time()) # current time: timestamp ==> struct_time
print(tuple_time) 

format_time = time.strftime("%Y-%m-%d %H:%M:%S", tuple_time) # struct_time ==> formatted string
print(format_time) 
print(time.strptime(format_time,"%Y-%m-%d %H:%M:%S")) # formatted string ==> struct_time (parsed with the same format)
print(time.mktime(tuple_time)) # struct_time ==> timestamp

结果：

1638512797.5577352
time.struct_time(tm_year=2021, tm_mon=12, tm_mday=3, tm_hour=14, tm_min=26, tm_sec=37, tm_wday=4, tm_yday=337, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=12, tm_mday=3, tm_hour=14, tm_min=26, tm_sec=37, tm_wday=4, tm_yday=337, tm_isdst=0)
2021-12-03 14:26:37
time.struct_time(tm_year=2021, tm_mon=12, tm_mday=3, tm_hour=14, tm_min=26, tm_sec=37, tm_wday=4, tm_yday=337, tm_isdst=-1)
1638512797.0

3. np.random模块

import numpy as np

# np.random.randint() draws random integers
print(np.random.randint(10)) # from [0, 10), e.g. 7
print(np.random.randint(10,20)) # from [10, 20), e.g. 11
print(np.random.randint(10, size=10)) # e.g. [1 4 9 9 2 3 1 9 7 1]
print(np.random.randint(10, size=(2,3))) # e.g. [[4 0 7]
                                         #       [8 4 0]]

# np.random.randn() draws random floats from the standard normal distribution
print(np.random.randn(2,4)) # e.g. [[-2.54856518 -0.12214179 -1.87864935  0.43161977]
                             #      [ 0.44264148  0.64066884  2.63730482 -0.89815009]]
print(np.random.normal(0, 1, size=5))  # normal distribution with mean 0 and standard deviation 1 (the standard normal)
# e.g. [ 0.68093563 -0.50425498 -1.22208785 -1.3887711   0.89877655]

# np.random.rand() draws random floats uniformly from [0, 1)
print(np.random.rand(2,4)) # e.g. [[0.4488376  0.76863046 0.32946063 0.31675537]
                            #      [0.77070435 0.12285917 0.96088168 0.04157771]]
print(np.random.random((2,4))) # same distribution as rand; the shape is passed as a single tuple

4. collections库使用

collections.Counter是对dict的扩展，用于计数

import collections
# Counter extends dict and is used for counting.
# A list such as ['w', 'h', 'i', 'c', 'h'] can be passed instead of a string.
counter = collections.Counter("which")
# Entries are read the same way as in a dict.
for key in counter:
    print(key, counter[key])
# update() adds the new occurrences to the existing counts.
counter.update("hhh")
for key in counter:
    print(key, counter[key])
# most_common(k) returns the k most frequent elements as a list.
top_one = counter.most_common(1)
print("most_common: %s" % (top_one,))

输出（其中的 ------ 分隔线是为区分三段输出手动加上的，代码本身不会打印）：
w 1
h 2
i 1
c 1
------
w 1
h 5
i 1
c 1
------
most_common: [('h', 5)]

collections.defaultdict
defaultdict会为不存在的键先初始化它的默认值（指定类型）

import collections
bag = ['a', 'o', 'c', 'a', 'a', 'c', 'b']
# A plain dict() raises KeyError on the first `count['a'] += 1`, because the
# key does not exist yet.
# defaultdict(int) first initialises a missing key by calling int(), so every
# new key starts at 0.
# Default values per factory: int -> 0, str -> '', list -> [], set -> set()
count = collections.defaultdict(int)
for fruit in bag:
    count[fruit] += 1
# Display the tallies:
# defaultdict(<class 'int'>, {'a': 3, 'o': 1, 'c': 2, 'b': 1})
print(count)

输出：
defaultdict(<class 'int'>, {'a': 3, 'o': 1, 'c': 2, 'b': 1})

5. 正则表达式re库

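对于 * . ? + $ ^ [ ] ( ) { } | \ 这些正则元字符，如果要匹配它们本身需要额外转义（注：只是英文的这些符号需要，中文的不需要；/ 在 Python 的 re 中不是元字符，无须转义）。
如 "\\(" 匹配 ( ，之所以要两次反斜杠是因为：先经过 Python 字符串转义（"\\" 变成一个 \），再交给正则引擎（\( 表示字面的左括号）；更推荐直接写成原始字符串 r"\("。
当正则中没有()时候，group()方法也可以取整体匹配结果，同group(0)；有()时候，group(n) 的索引号 n 代表第 n 个分组（括号）匹配的内容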

正则语法	备注
[ ]	[ ]中的只以unigram匹配，每个字符之间是或关系，如：[az]匹配a或z，[\u4E00-\u9FA5]匹配汉字
\|	整个表达式首先以 \| 划分成若干个或关系，每个子表达式长度>1也无须加括号，如："好听\|棒"匹配“好听”或者“棒”
( )	分组，对match对象用group(index)方法访问第index个()匹配内容

import re

# re.match() only matches at the start of the string; it is used less often.
# re.search() returns the first occurrence anywhere in the string (or None),
# so it is commonly used to check whether a kind of pattern is present.
line = "这首歌真好听!!!"  # sample input; `line` was previously used without being defined
a = re.search("(棒|好听|好哇)[了!的]+", line)
if a:  # matches e.g. "棒了", "好听!!!"
    print(a)

# Find every occurrence of a pattern.
content = 'Hello, I am Jerry, from Chongqing, a montain city, nice to meet you……'
# Raw string: in a normal literal '\w' is an invalid escape sequence, which
# newer Python versions warn about.
pattern = re.compile(r'\w*o\w*')  # any run of word characters containing 'o'
x = pattern.findall(content)  # list: ['Hello', 'from', 'Chongqing', 'montain', 'to', 'you']

# re.sub() is more flexible than str.replace(): 1. it matches with a regular
# expression; 2. the replacement can be computed by a function.
s = 'B83C72D1D8E67'


def convert(value):
    """Return the replacement text for one match: its number divided by 10.

    re.sub() calls this function once for each match. ``value`` is a single
    match object: ``value.group()`` is the matched text (same as ``group(0)``;
    with parentheses in the pattern, ``group(n)`` is the n-th group) and
    ``value.span()`` is the ``(start, end)`` position, start inclusive and
    end exclusive.

    The quotient is truncated to an integer, so a one-digit number becomes '0'.
    """
    num = int(value.group())
    # Both branches of the former `if num > 9` / `else` returned this same
    # expression, so the test was redundant.
    return str(int(num / 10))


# Raw string so that '\d' reaches the regex engine unchanged. Each
# non-overlapping run of one or two digits is handed to convert().
s = re.sub(r'\d{1,2}', convert, s)
# s before: B83C72D1D8E67
# s after:  B8C7D0D0E6

6. 简单爬取并解析页面

用requests爬取，用bs4解析，CSS选择器select

import requests
from bs4 import BeautifulSoup

def get_html(url, timeout=10):
    """Download ``url`` and return the page source as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before requests gives up and
            raises a timeout error. Without a timeout, requests.get() can
            block indefinitely on an unresponsive server.

    Returns:
        The decoded response body (``response.text``).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36"
    }  # present a browser User-Agent
    response = requests.get(url, headers=headers, timeout=timeout)
    # Decode the body with the encoding detected from its content.
    response.encoding = response.apparent_encoding
    return response.text

# Fetch a live page; the parsing demo below works on a small inline document.
r = get_html("https://www.baidu.com")
sample_markup = "<p class='button'>你好</p><p class='button'>你不好</p>"
soup = BeautifulSoup(sample_markup, 'lxml')  # parse with the lxml parser
# select() takes a CSS selector and returns a list of every matching tag with
# its content; '.button' is the same as 'p[class="button"]' here.
tag_soup = soup.select('.button')
for tag in tag_soup:
    print(tag.text)  # prints 你好, then 你不好

7. TF-IDF库使用

转自https://blog.csdn.net/weixin_44285715/article/details/105930874

import numpy as np
# Three short documents used as the corpus.
corpus = np.array(['The sun is shining',
                   'The weather is sweet',
                   'The sun is shining and the weather is sweet'])

from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer() # takes the corpus and outputs each word's term frequency (TF)
tf=count.fit_transform(corpus)
# Inspect the term-frequency matrix, one row per document.
# (A bare expression is only echoed in an interactive session or notebook.)
tf.toarray()
# array([[0, 1, 1, 1, 0, 1, 0],
#       [0, 1, 0, 0, 1, 1, 1],
#       [1, 2, 1, 1, 1, 2, 1]], dtype=int64)
# Inspect the vocabulary (word -> column index)
count.vocabulary_
# {'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'weather': 6, 'sweet': 4, 'and': 0}
# 'the': 5 means the count for 'the' is stored at index 5, i.e. the 6th position of each vector.
# That position holds 1 in the first vector, 1 in the second and 2 in the third:
# 'the' occurs once in the first document, once in the second and twice in the third.

from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer() # takes the term-frequency matrix as input and produces TF-IDF values
tf_idf=tfidf.fit_transform(tf) 
# Inspect the TF-IDF matrix
tf_idf.toarray()
# array([[0.  , 0.43, 0.56, 0.56, 0.  , 0.43, 0.  ],
#       [0.  , 0.43, 0.  , 0.  , 0.56, 0.43, 0.56],
#       [0.4 , 0.48, 0.31, 0.31, 0.31, 0.48, 0.31]])

或者把两步并为一步:
TfidfVectorizer().fit_transform(corpus) = TfidfTransformer().fit_transform(CountVectorizer().fit_transform(corpus))

from sklearn.feature_extraction.text import TfidfVectorizer
# TfidfVectorizer goes from the raw corpus to the TF-IDF matrix in a single step.
countfidf = TfidfVectorizer()
count_tfidf = countfidf.fit_transform(corpus)
# Inspect the output (a bare expression is only echoed in an interactive session or notebook)
count_tfidf.toarray()
# array([[0.  , 0.43, 0.56, 0.56, 0.  , 0.43, 0.  ],
#       [0.  , 0.43, 0.  , 0.  , 0.56, 0.43, 0.56],
#       [0.4 , 0.48, 0.31, 0.31, 0.31, 0.48, 0.31]])

注意：使用中文文本时list中需要事先以空格隔开分词，否则模块不会分词。
另外，对中文句子列表（sentences）使用 CountVectorizer() 时，默认的 token_pattern（r"(?u)\b\w\w+\b"）会过滤掉长度为1的词；如需保留单字词，需要修改参数token_pattern（例如 token_pattern=r"(?u)\b\w+\b"）

Python操作

Python操作

0. Python特性

1. 画图表

2. time库当前时间打印

3. np.random模块

4. collections库使用

5. 正则表达式re库

6. 简单爬取并解析页面

7. TF-IDF库使用

相关阅读更多精彩内容

友情链接更多精彩内容