Kaggle竞赛-Titanic

刘二大人 第08讲 课后练习

题目地址:[Titanic - Machine Learning from Disaster | Kaggle](https://www.kaggle.com/competitions/titanic)

干了兄弟们,该说的都在注释里了。

Pandas基本操作参考本网站另一篇博文:Pandas快速入门指南:CSV操作与数据清洗核心 - GuTaicheng’s Blog

代码:
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd

class TrainDataset(Dataset):
    """Titanic training split exposed as ``(features, label)`` float32 tensors.

    Every sample is an 8-element feature vector laid out as
    ``Pclass, Age, SibSp, Parch, Fare, Sex_Encoded, Embarked_Q, Embarked_S``
    together with a label tensor of shape ``(1,)`` taken from ``Survived``.

    Args:
        filepath: CSV source handed straight to ``pandas.read_csv``.
    """

    # Fixed feature layout. The Embarked indicators are listed explicitly so the
    # feature width never depends on which ports happen to occur in the file.
    FEATURE_COLUMNS = (
        'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
        'Sex_Encoded', 'Embarked_Q', 'Embarked_S',
    )

    def __init__(self, filepath):
        frame = pd.read_csv(filepath)

        # Label: selected by name, consistent with how the features are selected.
        labels = frame['Survived'].values.astype(np.float32)
        self.y_data = torch.from_numpy(labels).unsqueeze(dim=1)

        # Missing Age / Fare values are replaced by the column mean of this file.
        frame['Age'] = frame['Age'].fillna(frame['Age'].mean())
        frame['Fare'] = frame['Fare'].fillna(frame['Fare'].mean())

        # Sex as a single binary column (male -> 1, female -> 0).
        frame['Sex_Encoded'] = frame['Sex'].map({'male': 1, 'female': 0}).astype(np.float32)

        # Embarked: missing values become 'S'; 'C' is the reference category,
        # encoded as both indicators being 0.
        embarked = frame['Embarked'].fillna('S')
        frame['Embarked_Q'] = (embarked == 'Q').astype(np.float32)
        frame['Embarked_S'] = (embarked == 'S').astype(np.float32)

        features = frame[list(self.FEATURE_COLUMNS)].values.astype(np.float32)
        self.x_data = torch.from_numpy(features)

        self.len = self.x_data.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of samples."""
        return self.len

class TestDataset(Dataset):
    """Titanic test split exposed as ``(features, passenger_id)`` pairs.

    Features are float32 vectors of 8 elements laid out as
    ``Pclass, Age, SibSp, Parch, Fare, Sex_Encoded, Embarked_Q, Embarked_S``.
    There is no label; the ``PassengerId`` is returned alongside each sample
    so predictions can be matched back to passengers.

    Args:
        filepath: CSV source handed straight to ``pandas.read_csv``.
    """

    # Fixed feature layout. The Embarked indicators are listed explicitly so the
    # feature width never depends on which ports happen to occur in the file.
    FEATURE_COLUMNS = (
        'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
        'Sex_Encoded', 'Embarked_Q', 'Embarked_S',
    )

    def __init__(self, filepath):
        frame = pd.read_csv(filepath)
        self.passenger_ids = frame['PassengerId'].values

        # Missing Age / Fare values are replaced by the column mean of this file.
        frame['Age'] = frame['Age'].fillna(frame['Age'].mean())
        frame['Fare'] = frame['Fare'].fillna(frame['Fare'].mean())

        # Sex as a single binary column (male -> 1, female -> 0).
        frame['Sex_Encoded'] = frame['Sex'].map({'male': 1, 'female': 0}).astype(np.float32)

        # Embarked: missing values become 'S'; 'C' is the reference category,
        # encoded as both indicators being 0.
        embarked = frame['Embarked'].fillna('S')
        frame['Embarked_Q'] = (embarked == 'Q').astype(np.float32)
        frame['Embarked_S'] = (embarked == 'S').astype(np.float32)

        features = frame[list(self.FEATURE_COLUMNS)].values.astype(np.float32)
        self.x_data = torch.from_numpy(features)

        self.len = self.x_data.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, passenger_id)`` pair stored at ``index``."""
        return self.x_data[index], self.passenger_ids[index]

    def __len__(self):
        """Return the number of samples."""
        return self.len


# Training split: served in shuffled mini-batches of 32.
train_dataset = TrainDataset('DataSet/titanic/train.csv')
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=0,
)

# Test split: file order is kept so predictions line up with PassengerId.
test_dataset = TestDataset('DataSet/titanic/test.csv')
test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=0,
)

class TitanicNet(nn.Module):
    """Three-layer perceptron (8 -> 4 -> 2 -> 1) producing a probability in (0, 1)."""

    def __init__(self):
        super().__init__()
        # Linear layers are created in l1, l2, l3 order so seeded initialisation
        # and state_dict keys stay the same.
        self.l1 = nn.Linear(8, 4)
        self.l2 = nn.Linear(4, 2)
        self.l3 = nn.Linear(2, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a ``(batch, 8)`` feature tensor to ``(batch, 1)`` probabilities."""
        hidden = x
        for layer in (self.l1, self.l2):
            hidden = self.relu(layer(hidden))
        logits = self.l3(hidden)
        return self.sigmoid(logits)

# Model, loss and optimiser shared by train() and test().
my_model = TitanicNet()
# Binary cross-entropy on the model's sigmoid output, averaged over each batch.
criterion = nn.BCELoss(reduction='mean')
optimizer = optim.SGD(params=my_model.parameters(), lr=0.05)


def train():
    """Run one optimisation pass over ``train_loader``.

    Returns:
        The mean of the per-batch losses seen during this pass.

    Raises:
        ZeroDivisionError: If ``train_loader`` yields no batches.
    """
    # test() puts the model in eval mode, so set train mode explicitly in case
    # training resumes after an inference run.
    my_model.train()

    batch_losses = []
    for x_data, y_data in train_loader:
        optimizer.zero_grad()
        loss = criterion(my_model(x_data), y_data)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    return sum(batch_losses) / len(batch_losses)


def test(threshold=0.7):
    """Run inference on the unlabeled test set and build a submission table.

    Args:
        threshold: Minimum predicted survival probability for a passenger to
            be labelled as survived (1); anything below is labelled 0.
            Defaults to 0.7.

    Returns:
        A DataFrame with ``PassengerId`` and ``Survived`` columns, one row per
        test sample, in the order ``test_loader`` yields them.
    """
    my_model.eval()  # switch to evaluation mode
    all_predictions = []
    all_ids = []

    print("\n--- 开始在测试集上进行推理 ---")

    with torch.no_grad():  # no gradients are needed for inference
        for x_data, passenger_ids in test_loader:
            # Forward pass: one probability per sample, shape (batch, 1).
            outputs = my_model(x_data)

            # Compare against the cut-off to get a boolean (batch, 1) tensor,
            # cast to 0/1 integers, then drop the trailing dimension so each
            # batch contributes a flat vector of labels.
            predicted_labels = (outputs >= threshold).int().squeeze(dim=1)

            # Collect plain Python ints for the output table.
            all_predictions.extend(predicted_labels.tolist())
            all_ids.extend(passenger_ids.tolist())

    submission_df = pd.DataFrame({
        'PassengerId': all_ids,
        'Survived': all_predictions
    })

    print("--- 推理完成 ---")
    print(f"生成的预测总数:{len(all_predictions)}")
    print("前 5 个预测结果:")
    print(submission_df.head())

    # The model is left in eval mode; call my_model.train() before training again.
    return submission_df

if __name__ == '__main__':
    # Train for 2000 epochs, reporting the mean batch loss on every 200th one.
    for epoch_num in range(2000):
        epoch_loss = train()
        if (epoch_num + 1) % 200 == 0:
            print(f"epoch_num = {epoch_num}, loss = {epoch_loss}")

    # Predict on the test split and write the Kaggle submission file.
    test().to_csv('DataSet/titanic/submission.csv', index=False)

最好成绩

image-20251020223447853


Kaggle竞赛-Titanic
https://blog.gutaicheng.top/2025/10/19/Kaggle竞赛-Titanic/
作者
GuTaicheng
发布于
2025年10月19日
许可协议