Administrator 1 месяц назад
Родитель
Коммит
811105be4d
1 изменённый файл: 40 добавлений и 18 удалений
  1. 40 18
      train_corrector.py

+ 40 - 18
train_corrector.py

@@ -1,11 +1,10 @@
 import pandas as pd
 import joblib
 from sklearn.ensemble import RandomForestRegressor
-from sklearn.model_selection import train_test_split
 
 # === 配置路径 ===
-csv_path = 'C:\\Users\\Administrator\\Desktop\\defrost\\feedback_data.csv'
-model_save_path = "defrost_time_corrector.pkl"
+csv_path = 'C:\\Users\\Administrator\\Desktop\\defrost\\feedback_data.csv'  # 你的csv
+model_save_path = "defrost_time_corrector.pkl"  # 模型保存路径
 
 # === 特征列定义 ===
 feature_columns = [
@@ -14,21 +13,43 @@ feature_columns = [
 ]
 
 # === 1. 读取CSV并预处理 ===
-df = pd.read_csv(csv_path, parse_dates=["t_formula", "t_real"], encoding='gbk')
-
-# 确保字段类型正确
-df["material_name"] = df["material_name"].astype(str)
-df["manufactured_goods"] = df["manufactured_goods"].astype(str)
+try:
+    df = pd.read_csv(csv_path, parse_dates=["t_formula", "t_real"], encoding='gbk')
+    print(f"✅ 成功读取CSV文件,共{len(df)}条数据")
+except Exception as e:
+    print(f"❌ 读取CSV失败: {e}")
+    exit(1)
+
+# 确保字段类型正确(如果这两列存在)
+for col in ["material_name", "manufactured_goods"]:
+    if col in df.columns:
+        df[col] = df[col].astype(str)
 
 # 计算真实解冻时长(小时)
 df["t_real_hours"] = (df["t_real"] - df["t_formula"]).dt.total_seconds() / 3600
 
-# === 2. 训练模型(用已有所有历史数据) ===
+# 检查有没有缺失特征
+missing_features = [col for col in feature_columns if col not in df.columns]
+if missing_features:
+    print(f"❌ 缺少必要特征列: {missing_features}")
+    exit(1)
+
+# === 2. 智能训练模型 ===
 X = df[feature_columns]
 y = df["t_real_hours"]
 
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-
+if len(X) >= 10:
+    # 数据够多,做train_test_split
+    from sklearn.model_selection import train_test_split
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+    print(f"📚 数据量 {len(X)},已划分训练集和测试集")
+else:
+    # 数据少,直接全量训练
+    X_train, y_train = X, y
+    X_test, y_test = None, None
+    print(f"⚠️ 数据量太少({len(X)}条),直接全量训练")
+
+# 建立随机森林回归模型
 model = RandomForestRegressor(n_estimators=100, random_state=42)
 model.fit(X_train, y_train)
 
@@ -36,10 +57,8 @@ model.fit(X_train, y_train)
 joblib.dump(model, model_save_path)
 print(f"✅ 模型训练完成,已保存为 {model_save_path}")
 
-# === 3. 用最新数据预测(比如最后一条或多条) ===
-
-# 假设你要预测最后新增的一条数据(如果多条可以改)
-new_sample = df.tail(1)  # 取最后一行,也可以是 tail(n) 最后n行
+# === 3. 预测最新一条数据 ===
+new_sample = df.tail(1)  # 取最后一行
 
 X_new = new_sample[feature_columns]
 predicted_time = model.predict(X_new)[0]
@@ -48,8 +67,11 @@ predicted_time = model.predict(X_new)[0]
 df.loc[new_sample.index, "predicted_t_real_hours"] = predicted_time
 
 # === 4. 保存带预测值的CSV ===
-df.to_csv(csv_path, encoding='gbk', index=False)
-print(f"✅ 最新数据预测完成,已更新到 {csv_path}")
+try:
+    df.to_csv(csv_path, encoding='gbk', index=False)
+    print(f"✅ 最新数据预测完成,已更新到 {csv_path}")
+except Exception as e:
+    print(f"❌ 保存CSV失败: {e}")
 
+# === 5. 打印最终预测结果 ===
 print(f"\n📊 预测最后一条数据真实解冻时间为:{predicted_time:.2f} 小时")
-