Python统计建模
# Python统计建模 - 完整代码示例
# 统计建模使用统计模型分析数据关系,包括线性回归、广义线性模型等
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
# 由于环境可能没有 statsmodels,我们使用手动实现 + 可运行的完整代码
# 同时展示 statsmodels 的 API 风格
# ======== 第一部分:普通最小二乘 (OLS) 回归 ========
np.random.seed(42)
n = 100
# 生成模拟数据
X1 = np.random.normal(0, 1, n)
X2 = np.random.normal(0, 1, n)
# 真实模型: y = 3 + 2*X1 - 1.5*X2 + 误差
true_intercept, true_b1, true_b2 = 3.0, 2.0, -1.5
y = true_intercept + true_b1 * X1 + true_b2 * X2 + np.random.normal(0, 1.5, n)
# 手动实现 OLS 估计
# y = X * beta + epsilon, beta_hat = (X'X)^(-1) X'y
X_design = np.column_stack([np.ones(n), X1, X2]) # 添加常数项
beta_hat = np.linalg.inv(X_design.T @ X_design) @ X_design.T @ y
y_pred = X_design @ beta_hat
residuals = y - y_pred
# 残差方差和标准误
n_params = X_design.shape[1]
sigma2_hat = np.sum(residuals**2) / (n - n_params)
cov_beta = sigma2_hat * np.linalg.inv(X_design.T @ X_design)
se_beta = np.sqrt(np.diag(cov_beta))
# t 统计量和 p-value
t_stats = beta_hat / se_beta
p_values = 2 * (1 - stats.t.cdf(np.abs(t_stats), n - n_params))
print("OLS 回归结果:")
print(f"{'参数':10s} {'估计值':10s} {'标准误':10s} {'t 值':10s} {'p 值':10s}")
print(f"{'常数':10s} {beta_hat[0]:10.4f} {se_beta[0]:10.4f} {t_stats[0]:10.4f} {p_values[0]:10.6f}")
print(f"{'X1':10s} {beta_hat[1]:10.4f} {se_beta[1]:10.4f} {t_stats[1]:10.4f} {p_values[1]:10.6f}")
print(f"{'X2':10s} {beta_hat[2]:10.4f} {se_beta[2]:10.4f} {t_stats[2]:10.4f} {p_values[2]:10.6f}")
# 模型整体显著性 (F 检验)
y_mean = np.mean(y)
ss_reg = np.sum((y_pred - y_mean)**2) # 回归平方和
ss_res = np.sum(residuals**2) # 残差平方和
ss_tot = ss_reg + ss_res # 总平方和
df_reg = n_params - 1
df_res = n - n_params
f_stat = (ss_reg / df_reg) / (ss_res / df_res)
f_p_value = 1 - stats.f.cdf(f_stat, df_reg, df_res)
# R² 和调整 R²
r_squared = ss_reg / ss_tot
adj_r_squared = 1 - (1 - r_squared) * (n - 1) / (n - n_params)
print(f"\n模型汇总:")
print(f"R² = {r_squared:.4f}")
print(f"调整 R² = {adj_r_squared:.4f}")
print(f"F({df_reg},{df_res}) = {f_stat:.4f}, p-value = {f_p_value:.6f}")
print(f"残差标准差 = {np.sqrt(sigma2_hat):.4f}")
# 2. 残差诊断
# 残差应为随机分布,不应有明显模式
residuals_std = residuals / np.sqrt(sigma2_hat)
print(f"\n残差分析:")
print(f"残差均值: {np.mean(residuals):.4f} (应接近 0)")
print(f"残差标准差: {np.std(residuals):.4f}")
# Durbin-Watson 统计量(检验残差自相关)
dw = np.sum(np.diff(residuals)**2) / np.sum(residuals**2)
print(f"Durbin-Watson: {dw:.4f} (接近 2 表示无自相关)")
# 3. 预测和预测区间
X_new = np.array([[1, 1.5, 0.5], # 新数据点 (含常数项)
[1, -0.5, 1.0]])
y_new_pred = X_new @ beta_hat
# 预测区间
for i in range(len(X_new)):
x_new = X_new[i]
pred_var = sigma2_hat * (1 + x_new @ cov_beta @ x_new)
pred_se = np.sqrt(pred_var)
t_crit = stats.t.ppf(0.975, n - n_params)
ci_lower = y_new_pred[i] - t_crit * pred_se
ci_upper = y_new_pred[i] + t_crit * pred_se
print(f"\n新观测 {i+1}: 预测值 = {y_new_pred[i]:.4f}")
print(f" 95% 预测区间: [{ci_lower:.4f}, {ci_upper:.4f}]")
# ======== 第二部分:广义线性模型 (GLM) ========
# 4. Logistic 回归 (二分类)
# 生成二分类数据
np.random.seed(123)
n_class = 200
X_logit = np.random.normal(0, 1, (n_class, 2))
# 真实 log-odds: log(p/(1-p)) = -1 + 2*X1 - 0.5*X2
log_odds = -1 + 2*X_logit[:, 0] - 0.5*X_logit[:, 1]
prob = 1 / (1 + np.exp(-log_odds)) # Sigmoid 函数
y_binary = np.random.binomial(1, prob)
# 手动实现 IRLS (迭代重加权最小二乘) 求解 Logistic 回归
beta_logit = np.zeros(3)
X_logit_design = np.column_stack([np.ones(n_class), X_logit])
for iteration in range(20):
eta = X_logit_design @ beta_logit
pi = 1 / (1 + np.exp(-eta)) # 预测概率
# 避免数值问题
pi = np.clip(pi, 1e-10, 1 - 1e-10)
# 权重矩阵 W = diag(pi * (1-pi))
W = np.diag(pi * (1 - pi))
# 调整后的响应变量
z = eta + (y_binary - pi) / (pi * (1 - pi))
# 加权最小二乘
XWX = X_logit_design.T @ W @ X_logit_design
XWz = X_logit_design.T @ W @ z
beta_logit_new = np.linalg.solve(XWX, XWz)
# 检查收敛
if np.max(np.abs(beta_logit_new - beta_logit)) < 1e-6:
print(f"\nLogistic 回归收敛于迭代 {iteration+1}")
break
beta_logit = beta_logit_new
print(f"\nLogistic 回归结果:")
print(f"常数: {beta_logit[0]:.4f}, X1: {beta_logit[1]:.4f}, X2: {beta_logit[2]:.4f}")
# 预测准确率
y_pred_prob = 1 / (1 + np.exp(-X_logit_design @ beta_logit))
y_pred_class = (y_pred_prob > 0.5).astype(int)
accuracy = np.mean(y_pred_class == y_binary)
print(f"分类准确率: {accuracy:.4f}")
# ======== 第三部分:模型选择 (AIC / BIC) ========
# 比较不同复杂度模型的 AIC 和 BIC
print(f"\n模型选择准则:")
for include_x2 in [True, False]:
if include_x2:
X_sub = X_design
k = 3
desc = "y ~ X1 + X2"
else:
X_sub = X_design[:, :2] # 只包括常数和 X1
k = 2
desc = "y ~ X1"
beta_sub = np.linalg.inv(X_sub.T @ X_sub) @ X_sub.T @ y
y_pred_sub = X_sub @ beta_sub
rss_sub = np.sum((y - y_pred_sub)**2)
aic = n * np.log(rss_sub / n) + 2 * k
bic = n * np.log(rss_sub / n) + k * np.log(n)
print(f" {desc:15s}: AIC={aic:.2f}, BIC={bic:.2f}, RSS={rss_sub:.2f}")
# ======== 第四部分:统计推断与 Bootstrap ========
# Bootstrap 法估计参数不确定性(不依赖正态假设)
n_bootstrap = 1000
bootstrap_betas = np.zeros((n_bootstrap, 3))
for i in range(n_bootstrap):
# 有放回重抽样
idx = np.random.choice(n, n, replace=True)
X_boot = X_design[idx]
y_boot = y[idx]
# 计算 OLS 估计
beta_boot = np.linalg.inv(X_boot.T @ X_boot) @ X_boot.T @ y_boot
bootstrap_betas[i] = beta_boot
# Bootstrap 标准误
boot_se = np.std(bootstrap_betas, axis=0)
# Bootstrap 置信区间
boot_ci_lower = np.percentile(bootstrap_betas, 2.5, axis=0)
boot_ci_upper = np.percentile(bootstrap_betas, 97.5, axis=0)
print(f"\nBootstrap 推断 (n={n_bootstrap}):")
print(f"{'参数':10s} {'估计值':10s} {'SE (解析)':10s} {'SE (Boot)':10s} {'95% CI (Boot)':20s}")
for i, name in enumerate(['常数', 'X1', 'X2']):
print(f"{name:10s} {beta_hat[i]:10.4f} {se_beta[i]:10.4f} {boot_se[i]:10.4f} "
f"[{boot_ci_lower[i]:.4f}, {boot_ci_upper[i]:.4f}]")
print("\n统计建模总结:OLS 是线性模型基础,GLM 扩展到非正态响应")
print("残差诊断和模型选择 (AIC/BIC) 是建模流程的关键环节")
