import pandas as pd

# Save the file path to a variable for easier access.
melbourne_file_path = '../input/melbourne-housing-snapshot/melb_data.csv'

# Read the CSV and store its contents in a DataFrame named melbourne_data.
melbourne_data = pd.read_csv(melbourne_file_path)
了解数据
1 2 3 4 5
# Print summary statistics of the numeric columns in melbourne_data.
# describe() only returns the summary; print() is needed to show it when this
# is run as a script rather than as the last expression of a notebook cell.
print(melbourne_data.describe())

# Show every column label available in the DataFrame.
print(melbourne_data.columns)
设定Target
1
# Prediction target: the 'SalePrice' column of the loaded DataFrame.
# NOTE(review): the Kaggle melb_data.csv snapshot appears to name its price
# column 'Price' rather than 'SalePrice' - confirm against
# melbourne_data.columns before running.
y = melbourne_data['SalePrice']
设定Input
在所有列索引中选择需要的列作为输入特征,例如:
1 2 3 4 5
# Columns used as model inputs (features).
# NOTE(review): these labels look like the Iowa housing columns - verify that
# they exist in melbourne_data.columns, otherwise the selection raises KeyError.
feature_names = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
X = melbourne_data[feature_names]

# Inspect the input data.
print(X.describe())
数据划分:train_X、val_X、train_y、val_y
1 2
from sklearn.model_selection import train_test_split

# train_test_split returns the splits grouped per input array:
# (X_train, X_test, y_train, y_test). Unpacking in any other order silently
# swaps features and targets. random_state is fixed so the split is repeatable.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
配置和拟合Model
以DecisionTreeRegressor为例
1 2 3
from sklearn.tree import DecisionTreeRegressor

# Fix random_state so repeated runs build the same tree.
melbourne_model = DecisionTreeRegressor(random_state=1)

# Fit the tree on the training features and training targets.
melbourne_model.fit(train_X, train_y)
做预测
以输入val_X为例
1
# Run the fitted model on val_X and keep the predicted values.
predictions = melbourne_model.predict(val_X)
模型验证
以Mean Absolute Error(MAE)为例
1 2
from sklearn.metrics import mean_absolute_error

# mean_absolute_error expects (y_true, y_pred): ground truth first, then the
# model's predictions for the same rows.
val_mae = mean_absolute_error(val_y, melbourne_model.predict(val_X))
调参时候的操作方法
以寻找最佳的最大叶节点数(max_leaf_nodes)为例
1 2 3 4 5 6 7 8 9 10 11 12 13
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a decision tree capped at max_leaf_nodes.

    A fresh DecisionTreeRegressor is fitted on (train_X, train_y) and then
    scored on (val_X, val_y) with mean absolute error.

    Args:
        max_leaf_nodes: Upper bound on the number of leaves in the tree.
        train_X: Features used to fit the model.
        val_X: Features used to evaluate the model.
        train_y: Targets matching train_X.
        val_y: Targets matching val_X.

    Returns:
        The mean absolute error of the model's predictions on val_X.
    """
    # random_state is fixed so every candidate size is compared on equal terms.
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    return mean_absolute_error(val_y, preds_val)
# Try several different caps on the number of leaf nodes.
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]

# Validation MAE for each candidate, in the same order as the candidates.
maes = [
    get_mae(mln, train_X, val_X, train_y, val_y)
    for mln in candidate_max_leaf_nodes
]

# Store the best value of max_leaf_nodes (it will be either 5, 25, 50, 100,
# 250 or 500). On ties, the first (smallest) candidate wins.
best_tree_size = candidate_max_leaf_nodes[maes.index(min(maes))]
print(best_tree_size)