# 线性回归模型使用技巧

## 核心理论知识

1. 模型假设：线性回归假设因变量与自变量之间存在线性关系，即y = β0 + β1x1 + β2x2 + ... + βnxn + ε，其中y是因变量，x是自变量，β是权重系数，ε是随机误差项。
2. 最小二乘法：线性回归的目标是找到一组权重，使所有数据点到直线的垂直距离（残差）的平方和最小，也就是最小化损失函数（均方误差）。
3. 系数估计：使用正规方程（当自变量个数较少时，可直接求得解析解）或梯度下降法（当特征数或样本量较大时）来求解该最小化问题，得到最优的权重β。

### 1. 多项式特征

from sklearn.preprocessing import PolynomialFeatures

# Expand the inputs with polynomial terms up to degree 2. The expansion is
# fitted on the training split and then re-applied unchanged to the test split.
poly_features = PolynomialFeatures(degree=2)
X_poly = poly_features.fit_transform(X_train)
X_test_poly = poly_features.transform(X_test)

# Fit an ordinary linear model on the expanded features.
model_poly = LinearRegression().fit(X_poly, y_train)

# Score on the held-out data.
y_pred_poly = model_poly.predict(X_test_poly)
mse_poly = mean_squared_error(y_test, y_pred_poly)
print(f"Mean Squared Error with Polynomials: {mse_poly}")


### 2. 正则化

from sklearn.linear_model import Lasso, Ridge

# Lasso: linear model with an L1 penalty (strength set by alpha).
lasso_model = Lasso(alpha=0.1).fit(X_train, y_train)
y_pred_lasso = lasso_model.predict(X_test)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
print(f"Mean Squared Error with Lasso: {mse_lasso}")

# Ridge: linear model with an L2 penalty (strength set by alpha).
ridge_model = Ridge(alpha=0.1).fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
print(f"Mean Squared Error with Ridge: {mse_ridge}")


### 3. 特征选择

from sklearn.feature_selection import SelectKBest, f_regression

# Keep the k=2 features ranked highest by the f_regression score.
# The selector is fitted on the training split only.
selector = SelectKBest(score_func=f_regression, k=2)
selector.fit(X_train, y_train)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Train and evaluate a linear model on the reduced feature set.
model_kbest = LinearRegression().fit(X_train_selected, y_train)
y_pred_kbest = model_kbest.predict(X_test_selected)
mse_kbest = mean_squared_error(y_test, y_pred_kbest)
print(f"Mean Squared Error with KBest Features: {mse_kbest}")


### 4. 超参数调优

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Candidate penalty strengths for Ridge, compared with 5-fold cross-validation
# using negative MSE as the selection score.
ridge_params = {'alpha': [0.1, 0.5, 1.0, 5.0, 10.0]}
ridge_search = GridSearchCV(
    Ridge(),
    ridge_params,
    scoring='neg_mean_squared_error',
    cv=5,
)
ridge_search.fit(X_train, y_train)

# Evaluate the best estimator found by the search on the test split.
best_ridge = ridge_search.best_estimator_
y_pred_tuned = best_ridge.predict(X_test)
mse_tuned = mean_squared_error(y_test, y_pred_tuned)
print(f"Mean Squared Error with Tuned Ridge: {mse_tuned}")


### 5. 分组特征

from sklearn.model_selection import GroupKFold

# Placeholder: replace with one group id per row of X. Rows sharing a group id
# are never split between the training and validation side of a fold.
groups = ...

# Group-aware 5-fold cross-validation.
gkf = GroupKFold(n_splits=5)
mse_list = []
for train_idx, test_idx in gkf.split(X, y, groups=groups):
    # Fold-local names are used on purpose so that the global
    # X_train / X_test / y_train / y_test split is not overwritten.
    X_fold_train, X_fold_test = X[train_idx], X[test_idx]
    y_fold_train, y_fold_test = y[train_idx], y[test_idx]

    model = LinearRegression()
    model.fit(X_fold_train, y_fold_train)
    y_pred = model.predict(X_fold_test)
    mse_list.append(mean_squared_error(y_fold_test, y_pred))

# Average the per-fold errors into a single score.
mean_mse_group = np.mean(mse_list)
print(f"Mean Squared Error with GroupKFold: {mean_mse_group}")


### 6. 处理缺失值

from sklearn.impute import SimpleImputer

# Replace missing values with the per-column mean learned from the training
# split; the same fill values are then applied to the test split.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Train and evaluate on the completed data.
model = LinearRegression().fit(X_train_imputed, y_train)
y_pred_imputed = model.predict(X_test_imputed)
mse_imputed = mean_squared_error(y_test, y_pred_imputed)
print(f"Mean Squared Error with Imputed Data: {mse_imputed}")


### 7. 集成方法

from sklearn.ensemble import BaggingRegressor

# Bagging: fit 10 linear models on bootstrap resamples and average them.
# The base model is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (the old name was later
# removed); the positional form works on both old and new releases.
bagging = BaggingRegressor(LinearRegression(), n_estimators=10, random_state=42)
bagging.fit(X_train, y_train)
y_pred_bagging = bagging.predict(X_test)
mse_bagging = mean_squared_error(y_test, y_pred_bagging)
print(f"Mean Squared Error with Bagging: {mse_bagging}")


### 8. 预处理和特征缩放

from sklearn.preprocessing import StandardScaler

# Standardise features using statistics learned from the training split only,
# then apply the same transformation to the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train and evaluate on the standardised features.
model_scaled = LinearRegression().fit(X_train_scaled, y_train)
y_pred_scaled = model_scaled.predict(X_test_scaled)
mse_scaled = mean_squared_error(y_test, y_pred_scaled)
print(f"Mean Squared Error with Scaled Data: {mse_scaled}")


### 9. 岭回归和弹性网络

from sklearn.linear_model import Ridge, ElasticNet

# Ridge regression (L2 penalty).
ridge_model = Ridge(alpha=1.0).fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
print(f"Mean Squared Error with Ridge Regression: {mse_ridge}")

# Elastic net: l1_ratio=0.5 mixes the L1 and L2 penalties in equal parts.
elastic_model = ElasticNet(alpha=1.0, l1_ratio=0.5).fit(X_train, y_train)
y_pred_elastic = elastic_model.predict(X_test)
mse_elastic = mean_squared_error(y_test, y_pred_elastic)
print(f"Mean Squared Error with Elastic Net: {mse_elastic}")


### 10. 鲁棒回归

from sklearn.linear_model import RANSACRegressor

# RANSAC repeatedly fits on random subsets and keeps the model supported by
# the largest set of inliers, which limits the influence of outliers.
ransac_model = RANSACRegressor(random_state=42).fit(X_train, y_train)
y_pred_ransac = ransac_model.predict(X_test)
mse_ransac = mean_squared_error(y_test, y_pred_ransac)
print(f"Mean Squared Error with RANSAC Regression: {mse_ransac}")


### 11. 高斯过程回归

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

# Gaussian-process regression with an RBF kernel; alpha=0.1 is added to the
# diagonal of the kernel matrix during fitting.
gpr = GaussianProcessRegressor(
    kernel=RBF(length_scale=1.0),
    alpha=0.1,
    random_state=42,
)
kernel = gpr.kernel
gpr.fit(X_train, y_train)
y_pred_gpr = gpr.predict(X_test)
mse_gpr = mean_squared_error(y_test, y_pred_gpr)
print(f"Mean Squared Error with Gaussian Process Regression: {mse_gpr}")


### 12. 模型融合

from sklearn.ensemble import VotingRegressor

# Base regressors to combine, as (name, estimator) pairs.
regressors = [
    ('lr', LinearRegression()),
    ('ridge', Ridge()),
    ('elastic', ElasticNet()),
    ('ransac', RANSACRegressor()),
]

# VotingRegressor averages the predictions of its members. It has no `voting`
# argument (hard/soft voting exists only on VotingClassifier), so passing one
# raises TypeError; use `weights=` for a non-uniform average.
ensemble = VotingRegressor(estimators=regressors)
ensemble.fit(X_train, y_train)
y_pred_ensemble = ensemble.predict(X_test)
mse_ensemble = mean_squared_error(y_test, y_pred_ensemble)
print(f"Mean Squared Error with Ensemble Regression: {mse_ensemble}")


### 13. 大规模数据处理

• 在线学习：使用SGDRegressor（随机梯度下降回归器），它允许模型在数据流上逐步学习，非常适合大型数据集。
  from sklearn.linear_model import SGDRegressor

# Linear model trained by stochastic gradient descent; training stops after at
# most 1000 epochs or once the improvement falls below tol.
sgd_reg = SGDRegressor(max_iter=1000, tol=1e-3, random_state=42).fit(X_train, y_train)
y_pred_sgd = sgd_reg.predict(X_test)
mse_sgd = mean_squared_error(y_test, y_pred_sgd)
print(f"Mean Squared Error with SGD Regression: {mse_sgd}")


### 14. 特征重要性分析

from sklearn.inspection import permutation_importance

# Permutation importance of an already-fitted `model`: each feature column of
# the test set is shuffled n_repeats times and the resulting score change is
# recorded.
result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42)

# Mean and standard deviation of the score change, one entry per feature.
importances = result.importances_mean
std = result.importances_std

for i, (mean_drop, spread) in enumerate(zip(importances, std)):
    print(f"Feature {i}, Importance: {mean_drop:.3f} ± {spread:.3f}")


### 15. 自动特征工程

from tpot import TPOTRegressor

# Automated pipeline search with TPOT (third-party package, must be installed).
# NOTE(review): generations / population_size presumably bound the size of the
# search, so larger values mean longer runs - confirm against the TPOT docs.
tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2, random_state=42)
tpot.fit(X_train, y_train)
# Evaluate the fitted TPOT object on the held-out split.
y_pred_tpot = tpot.predict(X_test)
mse_tpot = mean_squared_error(y_test, y_pred_tpot)
print(f"Mean Squared Error with TPOT: {mse_tpot}")


### 16. 模型解释性增强

import shap

# Build a SHAP explainer for the already-fitted `model` and explain the test rows.
# NOTE(review): for some model types shap.Explainer may also need background
# data (a masker) as a second argument - confirm for the installed shap version.
explainer = shap.Explainer(model)
shap_values = explainer(X_test)

# Visualise the explanation of a single prediction (the first test row).
shap.plots.waterfall(shap_values[0])


### 17. 集成模型的多样性

from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression, Ridge

# Members of the ensemble: two linear models plus a tree-based model, so the
# averaged predictors make different kinds of errors. `lr` and `ridge` are
# created here because no earlier snippet defines them.
lr = LinearRegression()
ridge = Ridge()
tree_reg = RandomForestRegressor(n_estimators=100, random_state=42)
ensemble_estimators = [('lr', lr), ('ridge', ridge), ('tree', tree_reg)]

# VotingRegressor averages member predictions; it accepts no `voting` argument
# (that option belongs to VotingClassifier).
ensemble = VotingRegressor(estimators=ensemble_estimators)
ensemble.fit(X_train, y_train)
y_pred_ensemble_diverse = ensemble.predict(X_test)
mse_ensemble_diverse = mean_squared_error(y_test, y_pred_ensemble_diverse)
print(f"Mean Squared Error with Diverse Ensemble Regression: {mse_ensemble_diverse}")


### 18. 集成模型的权重调整

# Use GridSearchCV to choose the averaging weights of the sub-models.
# VotingRegressor's `weights` must be a sequence with one number per member
# ('uniform'/'distance' are KNN options and `voting` does not exist here), so
# the grid holds explicit weight vectors: equal weights, plus one candidate per
# member in which that member counts double.
n_models = len(ensemble_estimators)
weight_candidates = [[1] * n_models] + [
    [2 if j == i else 1 for j in range(n_models)] for i in range(n_models)
]
param_grid = [{'weights': weight_candidates}]
grid = GridSearchCV(
    VotingRegressor(estimators=ensemble_estimators),
    param_grid,
    refit=True,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid.fit(X_train, y_train)

# Best weight vector found by cross-validation.
best_weights = grid.best_params_['weights']

# Rebuild the ensemble with those weights and evaluate it on the test split.
ensemble_best = VotingRegressor(estimators=ensemble_estimators, weights=best_weights)
ensemble_best.fit(X_train, y_train)
y_pred_ensemble_weighted = ensemble_best.predict(X_test)
mse_ensemble_weighted = mean_squared_error(y_test, y_pred_ensemble_weighted)
print(f"Mean Squared Error with Weighted Ensemble Regression: {mse_ensemble_weighted}")


### 19. 基于信息准则的Lasso（LassoLarsIC）

序列最小优化（SMO）是支持向量机（SVM）的求解算法，并不用于线性回归。下面的示例使用的是最小角回归（LARS）算法求解Lasso（L1正则化）路径，并用信息准则（AIC或BIC）自动选择正则化强度，从而得到稀疏解：

from sklearn.linear_model import LassoLarsIC

# Lasso fitted with the LARS algorithm; the penalty strength is chosen by the
# BIC information criterion.
lasso_lars = LassoLarsIC(criterion='bic').fit(X_train, y_train)
y_pred_lasso_lars = lasso_lars.predict(X_test)
mse_lasso_lars = mean_squared_error(y_test, y_pred_lasso_lars)
print(f"Mean Squared Error with LassoLarsIC: {mse_lasso_lars}")


### 20. 预测区间估计

from sklearn.linear_model import LinearRegression

# Train the model.
model = LinearRegression()
model.fit(X_train, y_train)

# LinearRegression.predict() has no `return_std` option, so the spread is
# estimated from the training residuals: the residual standard error with
# n - p - 1 degrees of freedom (p features plus the intercept).
train_residuals = np.ravel(y_train) - np.ravel(model.predict(X_train))
n_obs, n_features = np.shape(X_train)
dof = max(n_obs - n_features - 1, 1)  # guard against tiny training sets
y_std = np.sqrt(np.sum(train_residuals ** 2) / dof)

# Approximate 95% prediction interval under a normal-error assumption. This
# ignores the (usually small) uncertainty of the fitted coefficients.
y_pred = np.ravel(model.predict(X_test))
lower = y_pred - 1.96 * y_std
upper = y_pred + 1.96 * y_std

# Point-prediction error, and how often the interval contains the true value.
y_true = np.ravel(y_test)
mse_interval = mean_squared_error(y_true, y_pred)
coverage = np.mean((y_true >= lower) & (y_true <= upper))
print(f"Mean Squared Error with Prediction Intervals: {mse_interval}")
print(f"Approximate 95% interval coverage on the test set: {coverage:.3f}")


### 21. 动态特征选择

from sklearn.feature_selection import SelectKBest, f_regression

# Work only on the last `window_size` rows of the training data.
# NOTE(review): this presumes the rows are in chronological order with one row
# per day (7 rows = one week) - confirm for your data.
window_size = 7
X_train_window, y_train_window = X_train[-window_size:], y_train[-window_size:]

# Pick the 2 features most related to the target inside that window.
selector = SelectKBest(score_func=f_regression, k=2)
X_train_window_selected = selector.fit_transform(X_train_window, y_train_window)

# Fit a linear model on the selected features.
model_window = LinearRegression().fit(X_train_window_selected, y_train_window)

# Evaluate on the last `window_size` rows of the test split.
X_test_window_selected = selector.transform(X_test[-window_size:])
y_pred_window = model_window.predict(X_test_window_selected)
mse_window = mean_squared_error(y_test[-window_size:], y_pred_window)
print(f"Mean Squared Error with Dynamic Feature Selection: {mse_window}")


### 22. 模型校验和调优

from sklearn.model_selection import GridSearchCV

# Estimator whose constructor options are searched.
model = LinearRegression()

# Parameter grid. `normalize` was removed from LinearRegression in
# scikit-learn 1.2 (scale inputs with StandardScaler in a Pipeline instead), so
# the grid only uses options the estimator still accepts.
param_grid = {
    'fit_intercept': [True, False],
    'positive': [True, False],
}

# 5-fold cross-validated search scored by negative MSE.
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Best parameter combination.
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Retrain on the full training split with the best parameters.
model_best = LinearRegression(**best_params)
model_best.fit(X_train, y_train)

# Predict and evaluate.
y_pred_best = model_best.predict(X_test)
mse_best = mean_squared_error(y_test, y_pred_best)
print(f"Mean Squared Error with Best Parameters: {mse_best}")


### 23. 模型的稳定性分析

from sklearn.utils import resample

# Number of rows per bootstrap sample and number of bootstrap rounds.
n_samples = len(X_train)
n_rounds = 100

# Train and evaluate one model per bootstrap sample.
mse_bootstrap = []
for seed in range(n_rounds):
    # resample() draws the same rows (with replacement) from X and y together
    # and works for NumPy arrays as well as pandas objects, unlike `.iloc`.
    # A per-round seed keeps the analysis reproducible.
    X_bootstrap, y_bootstrap = resample(
        X_train, y_train, replace=True, n_samples=n_samples, random_state=seed
    )
    model_bootstrap = LinearRegression()
    model_bootstrap.fit(X_bootstrap, y_bootstrap)
    y_pred_bootstrap = model_bootstrap.predict(X_test)
    mse_bootstrap.append(mean_squared_error(y_test, y_pred_bootstrap))

# Mean and spread of the test MSE across bootstrap models; a small spread
# indicates a model that is stable under resampling of the training data.
mse_bootstrap_avg = np.mean(mse_bootstrap)
mse_bootstrap_std = np.std(mse_bootstrap)
print(f"Bootstrap Mean Squared Error: {mse_bootstrap_avg:.3f} ± {mse_bootstrap_std:.3f}")


### 24. 迁移学习

from sklearn.linear_model import SGDRegressor

# Placeholder: replace with the pretrained coefficients, a 1-D array holding
# one weight per feature of X_train.
pretrained_weights = ...

# LinearRegression.fit() recomputes the closed-form solution and overwrites any
# coef_ set beforehand, so it cannot be fine-tuned. A gradient-based linear
# model can start its optimisation from given weights via `coef_init`.
model = SGDRegressor(max_iter=1000, tol=1e-3, random_state=42)

# Fine-tune on the target task, starting from the pretrained weights.
# SGD is sensitive to feature scale, so standardise the inputs beforehand.
model.fit(X_train, np.ravel(y_train), coef_init=pretrained_weights)

# Predict and evaluate.
y_pred_transfer = model.predict(X_test)
mse_transfer = mean_squared_error(y_test, y_pred_transfer)
print(f"Mean Squared Error with Transfer Learning: {mse_transfer}")


## 实践案例

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Synthetic data: floor area drawn uniformly from [0, 500) and
# price = 2 * area + 3 + standard-normal noise.
np.random.seed(0)
X = 500 * np.random.rand(100, 1)
y = 3 + 2 * X + np.random.randn(100, 1)

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit ordinary least squares on the training split.
model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out rows and report the mean squared error.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


|
13天前
|

19 0
|
15天前
|

22 1
|
9月前
|

46 3
|
8月前
|

65 0
|
11月前
|

241 0
|

02 线性回归
02 线性回归
76 0
|

98 0

89 1
|

174 0
|

134 0