news 2026/4/16 14:09:58

DAY36 复习日

作者头像

张小明

前端开发工程师

1.2k 24
文章封面图
DAY36 复习日

我们使用神经网络的方法，用 PyTorch 重新对信贷数据集进行了处理。

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Fix random seeds so results are reproducible.
torch.manual_seed(42)
np.random.seed(42)


# --- 1. Data preprocessing ---
def load_and_preprocess_data(filepath):
    """Load the credit data set, clean it, and split it into train/val/test.

    Steps:
      * drop the ``Id`` column,
      * treat ``Current Loan Amount == 99999999.0`` as missing (sentinel value),
      * parse ``Years in current job`` into an integer 0..10,
      * impute numeric columns with the median, categorical ones with the mode,
      * one-hot encode categorical columns,
      * split 70/15/15 (stratified) and standardise features on train stats only.

    Args:
        filepath: path to the CSV file containing a 'Credit Default' target.

    Returns:
        (X_train, y_train, X_val, y_val, X_test, y_test, feature_names)
    """
    print("Loading data...")
    df = pd.read_csv(filepath)

    # The Id column carries no predictive signal.
    if 'Id' in df.columns:
        df = df.drop('Id', axis=1)
    print(f"Original shape: {df.shape}")

    # 99999999.0 in 'Current Loan Amount' is a sentinel (unlimited/erroneous);
    # mark as NaN so the median imputation below handles it.
    outlier_mask = df['Current Loan Amount'] == 99999999.0
    df.loc[outlier_mask, 'Current Loan Amount'] = np.nan

    # Map 'Years in current job': '< 1 year'->0, '1 year'->1, ..., '10+ years'->10.
    def parse_years(x):
        if pd.isna(x):
            return np.nan
        if '<' in x:
            return 0
        if '+' in x:
            return 10
        return int(x.split()[0])

    df['Years in current job'] = df['Years in current job'].apply(parse_years)

    # Impute numeric columns with the median (robust to outliers).
    # FIX: assign back instead of ``df[col].fillna(..., inplace=True)`` —
    # that chained form operates on a temporary and is deprecated in pandas 2.x.
    num_cols = df.select_dtypes(include=[np.number]).columns
    for col in num_cols:
        if col != 'Credit Default':
            df[col] = df[col].fillna(df[col].median())

    # Impute categorical columns with the mode.
    cat_cols = df.select_dtypes(include=['object']).columns
    for col in cat_cols:
        df[col] = df[col].fillna(df[col].mode()[0])

    # One-hot encode. FIX: dtype=float keeps the frame fully numeric — newer
    # pandas emits bool dummies, which would turn ``.values`` into an object
    # array and break ``torch.FloatTensor``.
    df = pd.get_dummies(df, columns=cat_cols, drop_first=True, dtype=float)
    print(f"Processed shape: {df.shape}")

    # Features / target; float32 matches the tensors used downstream.
    X = df.drop('Credit Default', axis=1).values.astype(np.float32)
    y = df['Credit Default'].values

    # 70% train, 15% validation, 15% test.
    # First split: train (70%) vs temp (30%); second split: temp -> 50/50.
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y)
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

    # Standardise using statistics fitted on the training set only
    # (avoids leaking validation/test information).
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    return (X_train, y_train, X_val, y_val, X_test, y_test,
            df.drop('Credit Default', axis=1).columns)


# --- 2. PyTorch dataset ---
class CreditDataset(Dataset):
    """Wraps numpy feature/label arrays as float tensors for a DataLoader."""

    def __init__(self, X, y):
        self.X = torch.FloatTensor(X)
        # BCELoss expects targets shaped (N, 1) for binary classification.
        self.y = torch.FloatTensor(y).unsqueeze(1)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


# --- 3. Neural network model ---
class CreditNN(nn.Module):
    """MLP for binary credit default prediction.

    Three hidden layers (128 -> 64 -> 32) with ReLU activations, dropout
    (p=0.3) after the first two, and a sigmoid output producing a
    probability in (0, 1).
    """

    def __init__(self, input_dim):
        super(CreditNN, self).__init__()
        self.layer1 = nn.Linear(input_dim, 128)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.3)
        self.layer2 = nn.Linear(128, 64)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.3)
        self.layer3 = nn.Linear(64, 32)
        self.relu3 = nn.ReLU()
        self.output = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.dropout1(self.relu1(self.layer1(x)))
        x = self.dropout2(self.relu2(self.layer2(x)))
        x = self.relu3(self.layer3(x))
        x = self.sigmoid(self.output(x))
        return x


# --- 4. Training ---
def train_model(model, train_loader, val_loader, criterion, optimizer,
                num_epochs=100, patience=10):
    """Train with early stopping on validation loss.

    The best checkpoint (lowest validation loss) is written to
    'best_credit_model.pth'.  Training stops after ``patience`` epochs
    without improvement.

    Returns:
        (train_losses, val_losses, train_accs, val_accs) per-epoch histories.
    """
    train_losses = []
    val_losses = []
    train_accs = []
    val_accs = []
    best_val_loss = float('inf')
    epochs_no_improve = 0

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    print(f"Training on {device}")

    for epoch in range(num_epochs):
        # --- training pass ---
        model.train()
        running_loss = 0.0
        correct_train = 0
        total_train = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            # Threshold the sigmoid output at 0.5 for accuracy tracking.
            predicted = (outputs > 0.5).float()
            total_train += y_batch.size(0)
            correct_train += (predicted == y_batch).sum().item()

        epoch_train_loss = running_loss / len(train_loader)
        epoch_train_acc = correct_train / total_train
        train_losses.append(epoch_train_loss)
        train_accs.append(epoch_train_acc)

        # --- validation pass ---
        model.eval()
        running_val_loss = 0.0
        correct_val = 0
        total_val = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                running_val_loss += loss.item()
                predicted = (outputs > 0.5).float()
                total_val += y_batch.size(0)
                correct_val += (predicted == y_batch).sum().item()

        epoch_val_loss = running_val_loss / len(val_loader)
        epoch_val_acc = correct_val / total_val
        val_losses.append(epoch_val_loss)
        val_accs.append(epoch_val_acc)

        print(f"Epoch [{epoch+1}/{num_epochs}] "
              f"Train Loss: {epoch_train_loss:.4f} Acc: {epoch_train_acc:.4f} | "
              f"Val Loss: {epoch_val_loss:.4f} Acc: {epoch_val_acc:.4f}")

        # Early stopping: keep the checkpoint with the lowest validation loss.
        if epoch_val_loss < best_val_loss:
            best_val_loss = epoch_val_loss
            epochs_no_improve = 0
            torch.save(model.state_dict(), 'best_credit_model.pth')
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print("Early stopping triggered!")
                break

    return train_losses, val_losses, train_accs, val_accs


# --- 5. Evaluation and visualisation ---
def evaluate_model(model, test_loader, feature_names):
    """Evaluate the best saved checkpoint on the test set.

    Prints accuracy/precision/recall/AUC, saves a confusion-matrix heatmap
    ('confusion_matrix.png') and an approximate feature-importance plot from
    the first layer's weights ('feature_importance.png').
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # FIX: map_location makes a CUDA-saved checkpoint loadable on a CPU-only box.
    model.load_state_dict(torch.load('best_credit_model.pth', map_location=device))
    model.to(device)
    model.eval()

    y_true = []
    y_pred = []
    y_scores = []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch = X_batch.to(device)
            outputs = model(X_batch)
            y_scores.extend(outputs.cpu().numpy())
            predicted = (outputs > 0.5).float()
            y_pred.extend(predicted.cpu().numpy())
            y_true.extend(y_batch.numpy())

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    y_scores = np.array(y_scores)

    # Metrics
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_scores)
    print("\n--- Test Set Evaluation ---")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"AUC: {auc:.4f}")

    # Confusion matrix heatmap.
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig('confusion_matrix.png')
    plt.close()
    print("Saved confusion_matrix.png")

    # First-layer weights as a rough feature-importance proxy: mean absolute
    # weight per input feature across all 128 hidden units.
    weights = model.layer1.weight.data.cpu().numpy()
    feature_importance = np.mean(np.abs(weights), axis=0)
    sorted_idx = np.argsort(feature_importance)[-10:]  # top 10 features
    plt.figure(figsize=(10, 6))
    plt.barh(range(10), feature_importance[sorted_idx])
    plt.yticks(range(10), feature_names[sorted_idx])
    plt.xlabel('Mean Absolute Weight')
    plt.title('Top 10 Feature Importance (Layer 1 Weights)')
    plt.savefig('feature_importance.png')
    plt.close()
    print("Saved feature_importance.png")


# --- Main ---
if __name__ == "__main__":
    # Load and split the data.
    data_path = 'e:\\桌面\\Python60DaysChallenge-main\\data.csv'
    (X_train, y_train, X_val, y_val,
     X_test, y_test, feature_names) = load_and_preprocess_data(data_path)

    # DataLoaders.
    batch_size = 64
    train_dataset = CreditDataset(X_train, y_train)
    val_dataset = CreditDataset(X_val, y_val)
    test_dataset = CreditDataset(X_test, y_test)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Model, loss, optimiser.
    input_dim = X_train.shape[1]
    model = CreditNN(input_dim)
    print(model)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Train with early stopping.
    train_losses, val_losses, train_accs, val_accs = train_model(
        model, train_loader, val_loader, criterion, optimizer,
        num_epochs=100, patience=10
    )

    # Plot training history.
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(train_losses, label='Train Loss')
    plt.plot(val_losses, label='Val Loss')
    plt.title('Loss Curve')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(train_accs, label='Train Acc')
    plt.plot(val_accs, label='Val Acc')
    plt.title('Accuracy Curve')
    plt.legend()
    plt.savefig('training_history.png')
    print("Saved training_history.png")

    # Evaluate the best checkpoint on the held-out test set.
    evaluate_model(model, test_loader, feature_names)

结果如下:

版权声明: 本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若内容造成侵权/违法违规/事实不符,请联系邮箱:809451989@qq.com进行投诉反馈,一经查实,立即删除!
网站建设 2026/4/16 5:57:48

三子棋游戏(函数)

1.文件结构- test.c ：测试游戏逻辑 - game.c ：游戏代码的实现 - game.h ：游戏代码的声明（函数声明、符号定义）test.c 代码内容#include <stdio.h> #include "game.h"// 菜单函数 void menu() {pri…

作者头像 李华
网站建设 2026/4/16 5:21:10

实习面试题-Go 性能优化面试题

1.如何利用 Go 语言特性设计一个 QPS 为 500 的服务器? 回答重点 思路 1) 回答 QPS 高低和什么因素有关 2) 指出题意的问题,条件变量很多,500的定义较为模糊。 3) 回答 GO 有哪些方式可以提高QPS。 4) 如何测试与分析机器的QPS。 示例回答 面试官好,QPS影响因素…

作者头像 李华
网站建设 2026/4/16 7:22:16

Wan2.2-T2V-A14B在历史事件复原视频中的考据严谨性评估

Wan2.2-T2V-A14B在历史事件复原视频中的考据严谨性评估 在数字人文与智能创作交汇的今天，我们正见证一场视觉叙事方式的根本性变革。当一部关于“安史之乱”的教学短片能在几分钟内由AI生成，且画面中士兵铠甲纹路、旗帜形制、建筑斗拱比例皆有考古依据时…

作者头像 李华
网站建设 2026/4/15 21:59:54

【Linux 系统编程核心】进程的本质、管理与核心操作

一、进程的核心定义与本质1.1 进程是什么？进程是程序的一次执行过程，是操作系统分配内存、CPU 等资源的基本单位。简单来说：程序：存储在硬盘上的代码、数据的静态集合（如a.out、ls命令）；进程…

作者头像 李华