|
|
|
@@ -0,0 +1,131 @@ |
|
|
|
# 1. Import dependencies (the Cambricon-PyTorch image ships with torch and torch_mlu preinstalled)
|
|
|
import torch |
|
|
|
import torch.nn as nn |
|
|
|
import torch.optim as optim |
|
|
|
from torch.utils.data import TensorDataset, DataLoader |
|
|
|
|
|
|
|
|
|
|
|
# 2. Verify the MLU environment (key step: confirm that an MLU device is available)
|
|
|
def check_mlu_env():
    """Verify that the Cambricon MLU runtime is usable and select card 0.

    Prints a short report to stdout: the torch_mlu version, the number of
    MLU devices, and the name of device 0.

    Returns:
        torch.device: the ``mlu:0`` device object.

    Raises:
        ImportError: if ``torch_mlu`` cannot be imported. The original import
            failure is chained as ``__cause__`` so the root cause stays
            visible in the traceback.
        RuntimeError: if ``torch_mlu.is_available()`` returns a falsy value.
    """
    print("=" * 50)
    print("【MLU 环境验证】")

    # Keep the try body down to the import itself so that only a genuinely
    # failing import is translated into the friendlier message below.
    try:
        import torch_mlu
    except ImportError as err:
        raise ImportError("❌ 未找到 torch_mlu,请确认使用 Cambricon-PyTorch 镜像") from err

    print(f"✅ torch_mlu 版本: {torch_mlu.__version__}")

    # NOTE(review): some torch_mlu releases may expose these helpers under
    # torch.mlu.* rather than on the torch_mlu package itself -- confirm
    # against the version installed in the target image.
    if not torch_mlu.is_available():
        raise RuntimeError("❌ MLU 设备不可用,请确认集群已挂载 MLU 卡且驱动正常")

    mlu_device_count = torch_mlu.device_count()
    print(f"✅ MLU 设备数量: {mlu_device_count}")
    # The script always uses the first card (index 0).
    print(f"✅ 当前使用 MLU 设备: {torch_mlu.get_device_name(0)}")
    return torch.device("mlu:0")
|
|
|
|
|
|
|
|
|
|
|
# 3. Define a minimal training model (a single linear layer, simulating a classification/regression task)
|
|
|
class HelloMLUModel(nn.Module):
    """Minimal demo network: one fully connected layer followed by ReLU."""

    def __init__(self, input_dim=10, output_dim=1):
        """Create the two sub-modules used by ``forward``.

        Args:
            input_dim: number of features per input sample.
            output_dim: number of values produced per sample.
        """
        super().__init__()
        # Registration order (linear, then relu) is kept because it shows up
        # in the module's printed representation.
        self.linear = nn.Linear(in_features=input_dim, out_features=output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply the linear layer to ``x`` and pass the result through ReLU."""
        return self.relu(self.linear(x))
|
|
|
|
|
|
|
|
|
|
|
# 4. Generate simulated training data (random features + random labels, used to exercise the pipeline)
|
|
|
def generate_sim_data(sample_num=100, input_dim=10, batch_size=10):
    """Build a shuffled DataLoader over randomly generated data.

    Features and labels are both drawn with ``torch.randn`` and are
    independent of each other, so the data is only meant for exercising the
    training pipeline, not for learning anything meaningful.

    Args:
        sample_num: number of samples to generate.
        input_dim: number of features per sample.
        batch_size: number of samples per batch. Defaults to 10, the value
            that used to be hard-coded here.

    Returns:
        DataLoader: iterates over ``(features, labels)`` batches taken from a
        ``TensorDataset`` whose tensors have shapes
        ``(sample_num, input_dim)`` and ``(sample_num, 1)``.
    """
    features = torch.randn(sample_num, input_dim)  # (sample_num, input_dim)
    labels = torch.randn(sample_num, 1)  # (sample_num, 1)

    # TensorDataset pairs each feature row with its label so DataLoader can
    # batch them together; shuffle=True reorders the samples on each pass.
    dataset = TensorDataset(features, labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)
|
|
|
|
|
|
|
|
|
|
|
# 5. Core training loop (adapted for the MLU device)
|
|
|
def train_hello_mlu(model, dataloader, device, epochs=5):
    """Train ``model`` on ``device`` with MSE loss and plain SGD (lr=0.01).

    Prints a banner, one summary line per epoch, and a completion banner.

    Args:
        model: the ``nn.Module`` to train; it is moved to ``device`` via
            ``model.to(device)``.
        dataloader: iterable yielding ``(features, labels)`` tensor batches.
        device: target ``torch.device`` for the model and every batch.
        epochs: number of full passes over ``dataloader``.

    Returns:
        list[float]: the sample-weighted average loss of each epoch, in
        order. An epoch in which the dataloader yields no samples is
        recorded as NaN.
    """
    print("\n" + "=" * 50)
    print(f"【开始 MLU 训练】共 {epochs} 轮")
    print("=" * 50)

    # Move the model first and only then hand its parameters to the
    # optimizer, so the optimizer is built from the parameters as they exist
    # after the device transfer.
    model.to(device)

    criterion = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    epoch_losses = []
    for epoch in range(epochs):
        model.train()  # no Dropout/BatchNorm is required here; kept as good practice
        total_loss = 0.0
        seen_samples = 0

        for batch_features, batch_labels in dataloader:
            # Inputs must live on the same device as the model.
            batch_features = batch_features.to(device)
            batch_labels = batch_labels.to(device)

            # Clear gradients left over from the previous step.
            optimizer.zero_grad()

            outputs = model(batch_features)
            loss = criterion(outputs, batch_labels)
            loss.backward()
            optimizer.step()

            # MSELoss returns the batch mean, so weight it by the batch size
            # to keep the epoch average correct when the last batch is short.
            current_batch = batch_features.size(0)
            total_loss += loss.item() * current_batch
            seen_samples += current_batch

        # Divide by the samples actually processed rather than the dataset
        # length: the two differ with drop_last or a subset sampler, and an
        # empty loader would otherwise raise ZeroDivisionError.
        avg_loss = total_loss / seen_samples if seen_samples else float("nan")
        epoch_losses.append(avg_loss)

        print(f"Epoch [{epoch + 1}/{epochs}] | 平均损失: {avg_loss:.6f} | 设备: {device}")

    print("\n" + "=" * 50)
    print("【MLU 训练完成】✅ Hello MLU Training!")
    print("=" * 50)

    return epoch_losses
|
|
|
|
|
|
|
|
|
|
|
# 6. Main entry point (chains all the steps together)
|
|
|
if __name__ == "__main__":
    # Dimensions shared by the model and the synthetic dataset.
    feature_dim = 10
    target_dim = 1
    n_samples = 100

    # Step 1: verify the MLU environment and obtain the device to train on.
    mlu_device = check_mlu_env()

    # Step 2: build the model and show its structure.
    model = HelloMLUModel(input_dim=feature_dim, output_dim=target_dim)
    print(f"\n✅ 模型初始化完成: {model}")

    # Step 3: create the simulated training data and report its size.
    train_dataloader = generate_sim_data(sample_num=n_samples, input_dim=feature_dim)
    sample_total = len(train_dataloader.dataset)
    batch_total = len(train_dataloader)
    print(f"✅ 模拟数据生成完成: 共 {sample_total} 个样本,{batch_total} 个批次")

    # Step 4: run the training loop on the MLU device.
    train_hello_mlu(model, train_dataloader, mlu_device, epochs=5)