The parts in italics are my own notes and understanding.
1. Linear Regression (LinearRegression)
Linear regression fits a least-squares model y = ax + b; learning here just means finding suitable values of a and b by minimizing a loss function.
The simple linear regression model is y = bX + e, where e ~ N(0, sigma^2 * I). In statistical terms this is equivalent to
y - bX ~ N(0, sigma^2 * I)
y | X, b ~ N(bX, sigma^2 * I)
The loss for this model is the squared error between the predictions and the true values, Loss = ||y - bX||^2. The parameters b can be computed in closed form via the normal equation
b = (X^T X)^{-1} X^T y
where (X^T X)^{-1} X^T is known as the pseudoinverse (Moore-Penrose inverse) of X.
Code:
import numpy as np

class LinearRegression:
    """
    The simple linear regression model is
        y = bX + e  where  e ~ N(0, sigma^2 * I)
    In probabilistic terms this corresponds to
        y - bX ~ N(0, sigma^2 * I)
        y | X, b ~ N(bX, sigma^2 * I)
    The loss for the model is simply the squared error between the model
    predictions and the true values:
        Loss = ||y - bX||^2
    The MLE for the model parameters b can be computed in closed form via the
    normal equation
        b = (X^T X)^{-1} X^T y
    where (X^T X)^{-1} X^T is known as the pseudoinverse / Moore-Penrose
    inverse.
    """

    def __init__(self, fit_intercept=True):
        self.beta = None
        self.fit_intercept = fit_intercept

    def fit(self, X, y):
        # convert X to a design matrix if we're fitting an intercept
        if self.fit_intercept:
            X = np.c_[np.ones(X.shape[0]), X]  # prepend a column of ones
        pseudo_inverse = np.dot(np.linalg.inv(np.dot(X.T, X)), X.T)  # compute the pseudoinverse
        self.beta = np.dot(pseudo_inverse, y)  # solve for b (stored as beta)

    def predict(self, X):
        # convert X to a design matrix if we're fitting an intercept
        if self.fit_intercept:
            X = np.c_[np.ones(X.shape[0]), X]
        return np.dot(X, self.beta)  # predictions y_hat = X b
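A quick sanity check of the closed-form fit (my own toy example, not from the original library) could look like this:
# Toy example: recover known coefficients from noisy synthetic data
rng = np.random.RandomState(0)
X = rng.randn(200, 3)
y = 4.0 + X @ np.array([2.0, -1.0, 0.5]) + 0.1 * rng.randn(200)

model = LinearRegression(fit_intercept=True)
model.fit(X, y)
print(model.beta)            # roughly [4.0, 2.0, -1.0, 0.5]: intercept first, then coefficients
print(model.predict(X[:5]))  # first five fitted values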
2. Ridge Regression (RidgeRegression)
Compared with LinearRegression, ridge adds an alpha-weighted penalty term to the loss function, which regularizes the coefficients.
Ridge regression uses the same model as simple linear regression, but adds an extra penalty on the L2 norm of the coefficients to the loss function; this is sometimes called Tikhonov regularization. The model itself is still
y = bX + e  where  e ~ N(0, sigma^2 * I)
but the loss is now computed as
RidgeLoss = ||y - bX||^2 + alpha * ||b||^2
The MLE for the parameters b can then be computed via an adjusted normal equation
b = (X^T X + alpha I)^{-1} X^T y
where (X^T X + alpha I)^{-1} X^T is the pseudoinverse adjusted for the L2 penalty on the coefficients.
Original code:
class RidgeRegression:
    """
    Ridge regression uses the same simple linear regression model but adds an
    additional penalty on the L2-norm of the coefficients to the loss function.
    This is sometimes known as Tikhonov regularization.
    In particular, the ridge model is still simply
        y = bX + e  where  e ~ N(0, sigma^2 * I)
    except now the error for the model is calculated as
        RidgeLoss = ||y - bX||^2 + alpha * ||b||^2
    The MLE for the model parameters b can be computed in closed form via the
    adjusted normal equation:
        b = (X^T X + alpha I)^{-1} X^T y
    where (X^T X + alpha I)^{-1} X^T is the pseudoinverse / Moore-Penrose
    inverse adjusted for the L2 penalty on the model coefficients.
    """

    def __init__(self, alpha=1, fit_intercept=True):
        """
        A ridge regression model fit via the adjusted normal equation.

        Parameters
        ----------
        alpha : float (default: 1)
            L2 regularization coefficient. Higher values correspond to a
            larger penalty on the L2 norm of the model coefficients.
        fit_intercept : bool (default: True)
            Whether to fit an additional intercept term in addition to the
            model coefficients.
        """
        self.beta = None
        self.alpha = alpha
        self.fit_intercept = fit_intercept

    def fit(self, X, y):
        # convert X to a design matrix if we're fitting an intercept
        if self.fit_intercept:
            X = np.c_[np.ones(X.shape[0]), X]
        A = self.alpha * np.eye(X.shape[1])
        pseudo_inverse = np.dot(np.linalg.inv(X.T @ X + A), X.T)
        self.beta = pseudo_inverse @ y

    def predict(self, X):
        # convert X to a design matrix if we're fitting an intercept
        if self.fit_intercept:
            X = np.c_[np.ones(X.shape[0]), X]
        return np.dot(X, self.beta)
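To see what the alpha penalty does, a small comparison (my own sketch) shows the coefficient norm shrinking as alpha grows. Note that since A = alpha * np.eye(X.shape[1]), this implementation also penalizes the intercept column.
# Toy example: larger alpha shrinks the fitted coefficients toward zero
rng = np.random.RandomState(1)
X = rng.randn(100, 5)
y = X @ np.array([3.0, -2.0, 0.0, 1.0, 0.5]) + 0.5 * rng.randn(100)

for alpha in (0.0, 1.0, 100.0):
    ridge = RidgeRegression(alpha=alpha)
    ridge.fit(X, y)
    print(alpha, np.linalg.norm(ridge.beta))  # the norm decreases as alpha increases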
The sigmoid function:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
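As a quick check of what this does (my own example): the sigmoid maps any real-valued score into (0, 1), which is what lets us read the output as a probability.
print(sigmoid(np.array([-5.0, 0.0, 5.0])))  # ~[0.0067, 0.5, 0.9933]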
3. Logistic Regression (LogisticRegression)
A linear model assigns a weight to each feature; the weighted sum is passed through the sigmoid function, and the resulting probability tells us which class the sample falls into.
class LogisticRegression:
    def __init__(self, penalty="l2", gamma=0, fit_intercept=True):
        """
        A simple logistic regression model fit via gradient descent on the
        penalized negative log likelihood.

        Parameters
        ----------
        penalty : str (default: 'l2')
            The type of regularization penalty to apply on the coefficients
            `beta`. Valid entries are {'l2', 'l1'}.
        gamma : float in [0, 1] (default: 0)
            The regularization weight. Larger values correspond to larger
            regularization penalties, and a value of 0 indicates no penalty.
        fit_intercept : bool (default: True)
            Whether to fit an intercept term in addition to the coefficients in
            b. If True, the estimates for `beta` will have M+1 dimensions,
            where the first dimension corresponds to the intercept.
        """
        err_msg = "penalty must be 'l1' or 'l2', but got: {}".format(penalty)
        assert penalty in ["l2", "l1"], err_msg
        self.beta = None
        self.gamma = gamma
        self.penalty = penalty
        self.fit_intercept = fit_intercept

    def fit(self, X, y, lr=0.01, tol=1e-7, max_iter=1e7):
        # convert X to a design matrix if we're fitting an intercept
        if self.fit_intercept:
            X = np.c_[np.ones(X.shape[0]), X]
        l_prev = np.inf  # previous loss, initialized to infinity
        self.beta = np.random.rand(X.shape[1])
        for _ in range(int(max_iter)):
            y_pred = sigmoid(np.dot(X, self.beta))
            loss = self._NLL(X, y, y_pred)
            if l_prev - loss < tol:
                return
            l_prev = loss
            self.beta -= lr * self._NLL_grad(X, y, y_pred)

    def _NLL(self, X, y, y_pred):
        """
        Penalized negative log likelihood of the targets under the current
        model.
            NLL = -1/N * (
                [sum_{i=0}^N y_i log(y_pred_i) + (1-y_i) log(1-y_pred_i)] -
                (gamma ||b||) / 2
            )
        """
        N, M = X.shape
        order = 2 if self.penalty == "l2" else 1
        nll = -np.log(y_pred[y == 1]).sum() - np.log(1 - y_pred[y == 0]).sum()
        penalty = 0.5 * self.gamma * np.linalg.norm(self.beta, ord=order)
        return (penalty + nll) / N

    def _NLL_grad(self, X, y, y_pred):
        """Gradient of the penalized negative log likelihood wrt beta"""
        N, M = X.shape
        p = self.penalty
        beta = self.beta
        gamma = self.gamma
        d_penalty = gamma * beta if p == "l2" else gamma * np.sign(beta)
        return -(np.dot(y - y_pred, X) + d_penalty) / N

    def predict(self, X):
        # convert X to a design matrix if we're fitting an intercept
        if self.fit_intercept:
            X = np.c_[np.ones(X.shape[0]), X]
        return sigmoid(np.dot(X, self.beta))
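A small end-to-end run on separable toy data (my own example) shows how the pieces fit together: fit with gradient descent, then threshold the predicted probabilities at 0.5.
# Toy binary classification example
rng = np.random.RandomState(2)
X = np.vstack([rng.randn(50, 2) - 2.0, rng.randn(50, 2) + 2.0])
y = np.r_[np.zeros(50), np.ones(50)]

clf = LogisticRegression(penalty="l2", gamma=0.01)
clf.fit(X, y, lr=0.1, max_iter=10000)
probs = clf.predict(X)              # predicted P(y = 1 | x)
print(np.mean((probs > 0.5) == y))  # accuracy; close to 1.0 on this easy problem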
Updated 2019.7.10
4. Bayesian Regression
There are two more algorithms, the Bayesian regressions. I don't fully understand them yet, so I'm just posting the code for now.
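One note before the code: both classes below call two small utility functions, is_number and is_symmetric_positive_definite, which are not shown in this post. A minimal sketch of what they might look like (my own assumption, not the original library code):
import numbers

def is_number(a):
    # anything that behaves like a scalar number counts
    return isinstance(a, numbers.Number)

def is_symmetric_positive_definite(A):
    # simple check: symmetric and all eigenvalues strictly positive
    A = np.asarray(A)
    return np.allclose(A, A.T) and bool(np.all(np.linalg.eigvalsh(A) > 0))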
class BayesianLinearRegressionUnknownVariance:
    """
    Bayesian Linear Regression
    --------------------------
    In its general form, Bayesian linear regression extends the simple linear
    regression model by introducing priors on model parameters b and/or the
    error variance sigma^2.
    The introduction of a prior allows us to quantify the uncertainty in our
    parameter estimates for b by replacing the MLE point estimate in simple
    linear regression with an entire posterior *distribution*, p(b | X, y,
    sigma), simply by applying Bayes rule:
        p(b | X, y) = [ p(y | X, b) * p(b | sigma) ] / p(y | X)
    We can also quantify the uncertainty in our predictions y* for some new
    data X* with the posterior predictive distribution:
        p(y* | X*, X, Y) = \int_{b} p(y* | X*, b) p(b | X, y) db
    Depending on the choice of prior it may be impossible to compute an
    analytic form for the posterior / posterior predictive distribution. In
    these cases, it is common to use approximations, either via MCMC or
    variational inference.

    Bayesian Regression w/ unknown variance
    ---------------------------------------
    If *both* b and the error variance sigma^2 are unknown, the conjugate prior
    for the Gaussian likelihood is the Normal-Gamma distribution (univariate
    likelihood) or the Normal-Inverse-Wishart distribution (multivariate
    likelihood).
    Univariate:
        b, sigma^2 ~ NG(b_mean, b_V, alpha, beta)
        sigma^2 ~ InverseGamma(alpha, beta)
        b | sigma^2 ~ N(b_mean, sigma^2 * b_V)
    where alpha, beta, b_V, and b_mean are parameters of the prior.
    Multivariate:
        b, Sigma ~ NIW(b_mean, lambda, Psi, rho)
        Sigma ~ W^{-1}(Psi, rho)
        b | Sigma ~ N(b_mean, 1/lambda * Sigma)
    where b_mean, lambda, Psi, and rho are parameters of the prior.
    Due to the conjugacy of the above priors with the Gaussian likelihood of
    the linear regression model we can compute the posterior distributions for
    the model parameters in closed form:
        B = (y - X b_mean)
        shape = N + alpha
        scale = (1 / shape) * {alpha * beta + B^T ([X b_V X^T + I])^{-1} B}
        sigma^2 | X, y ~ InverseGamma(shape, scale)
        A = (b_V^{-1} + X^T X)^{-1}
        mu_b = A b_V^{-1} b_mean + A X^T y
        cov_b = sigma^2 A
        b | X, y, sigma^2 ~ N(mu_b, cov_b)
    This allows us a closed form for the posterior predictive distribution as
    well:
        y* | X*, X, Y ~ N(X* mu_b, X* cov_b X*^T + I)
    """

    def __init__(self, alpha=1, beta=2, b_mean=0, b_V=None, fit_intercept=True):
        """
        Bayesian linear regression model with conjugate Normal-Gamma prior on b
        and sigma^2
            b, sigma^2 ~ NG(b_mean, b_V, alpha, beta)
            sigma^2 ~ InverseGamma(alpha, beta)
            b ~ N(b_mean, sigma^2 * b_V)

        Parameters
        ----------
        alpha : float (default: 1)
            The shape parameter for the Inverse-Gamma prior on sigma^2. Must be
            strictly greater than 0.
        beta : float (default: 2)
            The scale parameter for the Inverse-Gamma prior on sigma^2. Must be
            strictly greater than 0.
        b_mean : np.array of shape (M,) or float (default: 0)
            The mean of the Gaussian prior on b. If a float, assume b_mean is
            np.ones(M) * b_mean.
        b_V : np.array of shape (N, N) or np.array of shape (N,) or None
            A symmetric positive definite matrix that when multiplied
            element-wise by b_sigma^2 gives the covariance matrix for the
            Gaussian prior on b. If a list, assume b_V = diag(b_V). If None,
            assume b_V is the identity matrix.
        fit_intercept : bool (default: True)
            Whether to fit an intercept term in addition to the coefficients in
            b. If True, the estimates for b will have M+1 dimensions, where
            the first dimension corresponds to the intercept.
        """
        # this is a placeholder until we know the dimensions of X
        b_V = 1.0 if b_V is None else b_V
        if isinstance(b_V, list):
            b_V = np.array(b_V)
        if isinstance(b_V, np.ndarray):
            if b_V.ndim == 1:
                b_V = np.diag(b_V)
            elif b_V.ndim == 2:
                assert is_symmetric_positive_definite(
                    b_V
                ), "b_V must be symmetric positive definite"
        self.b_V = b_V
        self.beta = beta
        self.alpha = alpha
        self.b_mean = b_mean
        self.fit_intercept = fit_intercept
        self.posterior = {"mu": None, "cov": None}
        self.posterior_predictive = {"mu": None, "cov": None}

    def fit(self, X, y):
        # convert X to a design matrix if we're fitting an intercept
        if self.fit_intercept:
            X = np.c_[np.ones(X.shape[0]), X]
        N, M = X.shape
        beta = self.beta
        self.X, self.y = X, y
        if is_number(self.b_V):
            self.b_V *= np.eye(M)
        if is_number(self.b_mean):
            self.b_mean *= np.ones(M)
        # sigma
        I = np.eye(N)
        a = y - np.dot(X, self.b_mean)
        b = np.linalg.inv(np.dot(X, self.b_V).dot(X.T) + I)
        c = y - np.dot(X, self.b_mean)
        shape = N + self.alpha
        sigma = (1 / shape) * (self.alpha * beta ** 2 + np.dot(a, b).dot(c))
        scale = sigma ** 2
        # b_sigma is the mode of the inverse gamma prior on sigma^2
        b_sigma = scale / (shape - 1)
        # mean
        b_V_inv = np.linalg.inv(self.b_V)
        l = np.linalg.inv(b_V_inv + np.dot(X.T, X))
        r = np.dot(b_V_inv, self.b_mean) + np.dot(X.T, y)
        mu = np.dot(l, r)
        cov = l * b_sigma
        # posterior distribution for sigma^2 and b
        self.posterior = {
            "sigma**2": {"dist": "InvGamma", "shape": shape, "scale": scale},
            "b | sigma**2": {"dist": "Gaussian", "mu": mu, "cov": cov},
        }

    def predict(self, X):
        # convert X to a design matrix if we're fitting an intercept
        if self.fit_intercept:
            X = np.c_[np.ones(X.shape[0]), X]
        I = np.eye(X.shape[0])
        mu = np.dot(X, self.posterior["b | sigma**2"]["mu"])
        cov = np.dot(X, self.posterior["b | sigma**2"]["cov"]).dot(X.T) + I
        # MAP estimate for y corresponds to the mean of the posterior
        # predictive distribution
        self.posterior_predictive["mu"] = mu
        self.posterior_predictive["cov"] = cov
        return mu
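A minimal usage sketch (my own example, assuming the helper functions above) for the unknown-variance model:
# Fit, then inspect the closed-form posterior and the posterior predictive
rng = np.random.RandomState(3)
X = rng.randn(50, 2)
y = X @ np.array([1.5, -0.5]) + 0.3 * rng.randn(50)

blr = BayesianLinearRegressionUnknownVariance(alpha=1, beta=2)
blr.fit(X, y)
print(blr.posterior["b | sigma**2"]["mu"])    # posterior mean for [intercept, b1, b2]
print(blr.predict(X[:5]))                     # MAP predictions = posterior predictive mean
print(blr.posterior_predictive["cov"].shape)  # (5, 5) predictive covariance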
class BayesianLinearRegressionKnownVariance:
    """
    Bayesian Linear Regression
    --------------------------
    In its general form, Bayesian linear regression extends the simple linear
    regression model by introducing priors on model parameters b and/or the
    error variance sigma^2.
    The introduction of a prior allows us to quantify the uncertainty in our
    parameter estimates for b by replacing the MLE point estimate in simple
    linear regression with an entire posterior *distribution*, p(b | X, y,
    sigma), simply by applying Bayes rule:
        p(b | X, y) = [ p(y | X, b) * p(b | sigma) ] / p(y | X)
    We can also quantify the uncertainty in our predictions y* for some new
    data X* with the posterior predictive distribution:
        p(y* | X*, X, Y) = \int_{b} p(y* | X*, b) p(b | X, y) db
    Depending on the choice of prior it may be impossible to compute an
    analytic form for the posterior / posterior predictive distribution. In
    these cases, it is common to use approximations, either via MCMC or
    variational inference.
    (This part is the same as in the previous class.)

    Bayesian linear regression with known variance
    ----------------------------------------------
    If we happen to already know the error variance sigma^2, the conjugate
    prior on b is Gaussian. A common parameterization is:
        b | sigma, b_V ~ N(b_mean, sigma^2 * b_V)
    where b_mean, sigma and b_V are hyperparameters. Ridge regression is a
    special case of this model where b_mean = 0, sigma = 1 and b_V = I (i.e.,
    the prior on b is a zero-mean, unit covariance Gaussian).
    Due to the conjugacy of the above prior with the Gaussian likelihood in the
    linear regression model, we can compute the posterior distribution over the
    model parameters in closed form:
        A = (b_V^{-1} + X^T X)^{-1}
        mu_b = A b_V^{-1} b_mean + A X^T y
        cov_b = sigma^2 A
        b | X, y ~ N(mu_b, cov_b)
    which allows us a closed form for the posterior predictive distribution as
    well:
        y* | X*, X, Y ~ N(X* mu_b, X* cov_b X*^T + I)
    """

    def __init__(self, b_mean=0, b_sigma=1, b_V=None, fit_intercept=True):
        """
        Bayesian linear regression model with known error variance and
        conjugate Gaussian prior on b
            b | b_mean, sigma^2, b_V ~ N(b_mean, sigma^2 * b_V)
        Ridge regression is a special case of this model where b_mean = 0,
        sigma = 1 and b_V = I (i.e., the prior on b is a zero-mean, unit
        covariance Gaussian).

        Parameters
        ----------
        b_mean : np.array of shape (M,) or float (default: 0)
            The mean of the Gaussian prior on b. If a float, assume b_mean is
            np.ones(M) * b_mean.
        b_sigma : float (default: 1)
            A scaling term for covariance of the Gaussian prior on b.
        b_V : np.array of shape (N,N) or np.array of shape (N,) or None
            A symmetric positive definite matrix that when multiplied
            element-wise by b_sigma^2 gives the covariance matrix for the
            Gaussian prior on b. If a list, assume b_V = diag(b_V). If None,
            assume b_V is the identity matrix.
        fit_intercept : bool (default: True)
            Whether to fit an intercept term in addition to the coefficients in
            b. If True, the estimates for b will have M+1 dimensions, where
            the first dimension corresponds to the intercept.
        """
        # this is a placeholder until we know the dimensions of X
        b_V = 1.0 if b_V is None else b_V
        if isinstance(b_V, list):
            b_V = np.array(b_V)
        if isinstance(b_V, np.ndarray):
            if b_V.ndim == 1:
                b_V = np.diag(b_V)
            elif b_V.ndim == 2:
                assert is_symmetric_positive_definite(
                    b_V
                ), "b_V must be symmetric positive definite"
        self.posterior = {}
        self.posterior_predictive = {}
        self.b_V = b_V
        self.b_mean = b_mean
        self.b_sigma = b_sigma
        self.fit_intercept = fit_intercept

    def fit(self, X, y):
        # convert X to a design matrix if we're fitting an intercept
        if self.fit_intercept:
            X = np.c_[np.ones(X.shape[0]), X]
        N, M = X.shape
        self.X, self.y = X, y
        if is_number(self.b_V):
            self.b_V *= np.eye(M)
        if is_number(self.b_mean):
            self.b_mean *= np.ones(M)
        b_V = self.b_V
        b_mean = self.b_mean
        b_sigma = self.b_sigma
        b_V_inv = np.linalg.inv(b_V)
        l = np.linalg.inv(b_V_inv + np.dot(X.T, X))
        r = np.dot(b_V_inv, b_mean) + np.dot(X.T, y)
        mu = np.dot(l, r)
        cov = l * b_sigma ** 2
        # posterior distribution over b conditioned on b_sigma
        self.posterior["b"] = {"dist": "Gaussian", "mu": mu, "cov": cov}

    def predict(self, X):
        # convert X to a design matrix if we're fitting an intercept
        if self.fit_intercept:
            X = np.c_[np.ones(X.shape[0]), X]
        I = np.eye(X.shape[0])
        mu = np.dot(X, self.posterior["b"]["mu"])
        cov = np.dot(X, self.posterior["b"]["cov"]).dot(X.T) + I
        # MAP estimate for y corresponds to the mean of the posterior
        # predictive distribution
        self.posterior_predictive = {"dist": "Gaussian", "mu": mu, "cov": cov}
        return mu
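As a rough check of the ridge connection mentioned in the docstring (my own sketch, reusing the RidgeRegression class from section 2): with b_mean = 0, b_sigma = 1 and b_V = I, the posterior mean should coincide with the ridge solution at alpha = 1.
rng = np.random.RandomState(4)
X = rng.randn(80, 3)
y = X @ np.array([1.0, 2.0, -1.0]) + 0.2 * rng.randn(80)

bayes = BayesianLinearRegressionKnownVariance(b_mean=0, b_sigma=1)
bayes.fit(X, y)

ridge = RidgeRegression(alpha=1)
ridge.fit(X, y)

print(np.allclose(bayes.posterior["b"]["mu"], ridge.beta))  # expect True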