加载样本数据
from sklearn import datasets

# Load the bundled handwritten-digits sample dataset.
digits = datasets.load_digits()
# Split it into the feature matrix (x) and the target vector (y).
features, target = digits.data, digits.target
'''
load_boston
包含 506 个波士顿房价的观察值。这是一个用于研究回归算法的优质数据集。（注意：load_boston 已在 scikit-learn 1.2 中被移除。）
load_iris
包含 150 个鸢尾花尺寸的观察值。这是一个用于研究分类算法的优质数据集。
load_digits
包含 1797 个手写数字图片的观察值。这是一个用于研究图像分类算法的优质数据集。
'''
数据仿真
回归
from sklearn.datasets import make_regression

# Simulate a regression dataset. coef=True makes the call also return the
# coefficients, hence the three-way unpacking.
features, target, coefficients = make_regression(
    n_samples=100,
    n_features=3,
    n_informative=3,
    n_targets=1,
    noise=0.0,
    coef=True,
    random_state=1,
)
分类
from sklearn.datasets import make_classification

# Simulate a two-class classification dataset with 25% / 75% class weights.
features, target = make_classification(
    n_samples=100,
    n_features=3,
    n_informative=3,
    n_redundant=0,
    n_classes=2,
    weights=[0.25, 0.75],
    random_state=1,
)
聚类
from sklearn.datasets import make_blobs

# Simulate a clustering dataset made of three blobs in two dimensions.
features, target = make_blobs(
    n_samples=100,
    n_features=2,
    centers=3,
    cluster_std=0.5,
    shuffle=True,
    random_state=1,
)
读取数据
csv
import pandas as pd

# `url` is a placeholder: set it to the path or URL of the CSV file first.
dataframe = pd.read_csv(url)
excel
import pandas as pd

# `url` is a placeholder: set it to the path or URL of the Excel file first.
# The keyword is `sheet_name`; the old `sheetname` spelling was removed from
# pandas and raises TypeError. It accepts a position, a sheet name, or a list
# mixing both, e.g. sheet_name=[0, 1, 2, "Monthly Sales"].
# header=1 takes the column names from the second row (0-indexed).
dataframe = pd.read_excel(url, sheet_name=0, header=1)
json
import pandas as pd

# `url` is a placeholder: set it to the path or URL of the JSON file first.
# `orient` tells pandas how the JSON file is structured.
dataframe = pd.read_json(url, orient='columns')
# See also json_normalize, which converts semi-structured JSON data into a
# pandas DataFrame.
SQL
import pandas as pd
from sqlalchemy import create_engine

# Create an engine for the SQLite database file sample.db.
database_connection = create_engine('sqlite:///sample.db')

# Run the query and load every row of the `data` table into a DataFrame.
dataframe = pd.read_sql_query(
    sql='SELECT * FROM data',
    con=database_connection,
)