# Appendix 2 - source code in the Python programming language
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Plotting and pandas display setup.
color = sns.color_palette()
# Bug fix: '%matplotlib inline' is an IPython/Jupyter magic and is a syntax
# error in a plain .py module; keep it as a comment so the file stays valid.
# %matplotlib inline
pd.options.mode.chained_assignment = None # default='warn'
pd.set_option('display.max_columns', 500)
# Load the training set, parsing 'timestamp' as datetimes.
train_data = pd.read_csv("../input/train.csv", parse_dates=['timestamp'])
# Model the target on a log scale; log1p keeps zero values finite.
train_data['home_cost_log'] = np.log1p(train_data['home_cost'])
# Percent of missing values per column, keeping only columns that have any.
train_na = (train_data.isnull().sum() / len(train_data)) * 100
train_na = train_na.drop(train_na[train_na == 0].index).sort_values(ascending=False)
f, ax = plt.subplots(figsize=(12, 8))
plt.xticks(rotation='90')
sns.barplot(x=train_na.index, y=train_na)
ax.set(title='Percent missing data by feature', ylabel='% missing')
# Data cleaning: state 33 looks like a data-entry error — replace with the
# modal 'state' value.
train_data.loc[train_data['state'] == 33, 'state'] = train_data['state'].mode().iloc[0]
# build_year 20052009 is presumably a mangled "2005-2009" range; use the
# midpoint year. (NOTE(review): assumption — confirm against the raw data.)
train_data.loc[train_data['build_year'] == 20052009, 'build_year'] = 2007
# Correlation heatmap of internal dwelling characteristics vs. price.
internal_chars = ['allsq', 'lifesq', 'floor', 'max_floor', 'build_year', 'num_room', 'kitch_sq', 'state', 'home_cost']
corrmat = train_data[internal_chars].corr()
f, ax = plt.subplots(figsize=(10, 7))
plt.xticks(rotation='90')
sns.heatmap(corrmat, square=True, linewidths=.5, annot=True)
import datetime
import matplotlib.dates as mdates

# Transaction volume over time, with one x-axis tick per year.
years = mdates.YearLocator() # every year
yearsFmt = mdates.DateFormatter('%Y')
# Bug fix: pandas Series has no .value_cnts(); the method is .value_counts().
ts_vc = train_data['timestamp'].value_counts()
f, ax = plt.subplots(figsize=(12, 6))
# Bug fix: plt.bar's 'left' keyword was removed in matplotlib 2.2+; the
# bar positions are passed as 'x'.
plt.bar(x=ts_vc.index, height=ts_vc)
ax.xaxis.set_major_locator(years)
ax.xaxis.set_major_formatter(yearsFmt)
ax.set(title='Sales volume over time', ylabel='Number of transactions')
# District population density: area_m is presumably in square meters, so
# dividing by 1e6 gives square kilometers — TODO confirm units upstream.
train_data['area_dist'] = train_data['area_m'] / 1000000
train_data['density'] = train_data['raion_popul'] / train_data['area_dist']
f, ax = plt.subplots(figsize=(10, 6))
# Median density and cost per sub-area group ('sub').
sa_cost = train_data.groupby('sub')[['density', 'home_cost']].median()
sns.regplot(x="density", y="home_cost", data=sa_cost, scatter=True, truncate=True)
ax.set(title='Median home cost by raion population density (people per sq. km)')
# Share of the district population that is employed (jobs / population).
train_data['job_share'] = train_data['job_all'] / train_data['raion_popul']
f, ax = plt.subplots(figsize=(12, 6))
# Mean job share vs. mean cost per sub-area, with a 4th-order fit line.
sa_cost = train_data.groupby('sub')[['job_share', 'home_cost']].mean()
sns.regplot(x="job_share", y="home_cost", data=sa_cost, scatter=True, order=4, truncate=True)
ax.set(title='District mean home cost by share of jobing age population')
# Cost distribution by number of top-20 universities in the district:
# faint jittered points for raw rows, boxplot overlaid for the summary.
f, ax = plt.subplots(figsize=(12, 8))
sns.stripplot(x="university_top_20_raion", y="home_cost", data=train_data, jitter=True, alpha=.2, color=".8");
sns.boxplot(x="university_top_20_raion", y="home_cost", data=train_data)
ax.set(title='Distribution of home cost by # of top universities in Raion', xlabel='university_top_20_raion',
ylabel='home_cost')
# Home cost vs. distance to the Kremlin, with a linear fit.
f, ax = plt.subplots(figsize=(10, 6))
sns.regplot(x="kremlin_dist", y="home_cost", data=train_data, scatter=True, truncate=True, scatter_kws={'color': 'r', 'alpha':.2})
ax.set(title='Home cost by distance to Kremlin')
# Load the test set and compute its missing-value percentages.
test_data = pd.read_csv("../input/test.csv", parse_dates=['timestamp'])
test_na = (test_data.isnull().sum() / len(test_data)) * 100
test_na = test_na.drop(test_na[test_na == 0].index).sort_values(ascending=False)
# Stack train (minus the target) on top of test so shared features can be
# compared; tag each row with its dataset of origin.
all_data = pd.concat([train_data.drop('home_cost', axis=1), test_data])
all_data['dataset'] = ''
l = len(train_data)
# Bug fix: `all_data.iloc[:l]['dataset'] = 'train'` is a chained assignment
# that writes to a temporary copy and silently does nothing (the
# SettingWithCopy warning was disabled above). Write through one positional
# indexer instead.
dataset_col = all_data.columns.get_loc('dataset')
all_data.iloc[:l, dataset_col] = 'train'
all_data.iloc[l:, dataset_col] = 'test'
# Boolean mask selecting the training rows of the combined frame.
train_dataset = all_data['dataset'] == 'train'
# Compare transaction volume over time between the train and test periods.
years = mdates.YearLocator()
yearsFmt = mdates.DateFormatter('%Y')
# Bug fix: .value_cnts() does not exist on a pandas Series; use .value_counts().
ts_vc_train = train_data['timestamp'].value_counts()
ts_vc_test = test_data['timestamp'].value_counts()
f, ax = plt.subplots(figsize=(12, 6))
# Bug fix: plt.bar's 'left' keyword was removed in matplotlib 2.2+; use 'x'.
plt.bar(x=ts_vc_train.index, height=ts_vc_train)
plt.bar(x=ts_vc_test.index, height=ts_vc_test)
ax.xaxis.set_major_locator(years)
ax.xaxis.set_major_formatter(yearsFmt)
# Title typo fixed ("Num truns day").
ax.set(title='Number of transactions per day', ylabel='cnt')
from sklearn.metrics import make_scorer
def rmsle_exp(y_true_log, y_pred_log):
    """Root Mean Squared Logarithmic Error for log-space predictions.

    Parameters
    ----------
    y_true_log, y_pred_log : array-like
        True and predicted targets, both on the natural-log scale
        (they are mapped back with np.exp before scoring).

    Returns
    -------
    float
        RMSLE on the original scale.
    """
    y_true = np.exp(y_true_log)
    y_pred = np.exp(y_pred_log)
    # np.log1p(x) is the numerically stable equivalent of log(x + 1).
    return np.sqrt(np.mean(np.power(np.log1p(y_true) - np.log1p(y_pred), 2)))
def score_model(model, pipe):
    """Return (train RMSLE, test RMSLE) for a fitted *model*.

    Uses the module-level X_train/X_test/y_train/y_test splits and the
    fitted preprocessing pipeline *pipe*.
    """
    def _error(features, truth):
        # Transform with the fitted pipeline, predict, then score.
        return rmsle_exp(truth, model.predict(pipe.transform(features)))

    return _error(X_train, y_train), _error(X_test, y_test)
# Baseline: ordinary least squares on the pipeline-transformed features.
from sklearn.linear_model import LinearRegression
lr = LinearRegression(fit_intercept=True)
lr.fit(pipe.transform(X_train), y_train)
print("Train error: {:.4f}, Test error: {:.4f}".format(*score_model(lr, pipe)))
# Support-vector regression with default hyperparameters.
from sklearn.svm import SVR
svr = SVR()
svr.fit(pipe.transform(X_train), y_train)
print("Train error: {:.4f}, Test error: {:.4f}".format(*score_model(svr, pipe)))
# Random forest; min_samples_leaf=50 regularizes against overfitting,
# n_jobs=-1 uses all cores.
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(n_estimators=100, min_samples_leaf=50, n_jobs=-1)
rfr.fit(pipe.transform(X_train), y_train)
print("Train error: {:.4f}, Test error: {:.4f}".format(*score_model(rfr, pipe)))
# Gradient-boosted trees (XGBoost) with default hyperparameters.
from xgboost import XGBRegressor
xgb = XGBRegressor()
xgb.fit(pipe.transform(X_train), y_train)
print("Train error: {:.4f}, Test error: {:.4f}".format(*score_model(xgb, pipe)))
# Refit the preprocessing pipeline and the XGBoost model on the full data
# (X and y are presumably defined in an earlier, unshown cell — verify).
pipe.fit(X)
xgb.fit(pipe.transform(X), y)
# NOTE(review): the expression below is pasted notebook output — the repr of
# the fitted XGBRegressor. It constructs a throwaway estimator and has no
# effect on the model above; it could be deleted.
XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
learning_rate=0.1, max_delta_step=0, max_depth=3,
min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
objective='reg:linear', reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, seed=0, silent=True, subsample=1)
# Build the test design matrix with exactly the training frame's columns.
# NOTE(review): `test`, `macro` and `df` are not defined in this chunk —
# presumably created in earlier cells; confirm before running stand-alone.
test_data = pd.merge(test, macro, how='left', on='timestamp')
# Encode the timestamp as a small float (nanoseconds since epoch / 1e18).
test_data['timestamp'] = pd.to_numeric(pd.to_datetime(test_data['timestamp'])) / 1e18
test_data = pd.get_dummies(test_data).astype(np.float64)
# reindex() keeps columns present in test_data and fills columns missing
# from it with NaN — equivalent to the original per-column copy loop, but
# one aligned operation.
df_test = test_data.reindex(columns=df.columns)
# Assemble the submission: ids from the raw test set, predicted prices from
# `pred` (presumably produced by the fitted model — not visible in this
# chunk; verify it is back-transformed from log scale).
predict_data= pd.DataFrame()
predict_data['id'] = test['id']
predict_data['home_cost'] = pred
predict_data.head()
predict_data.to_csv('pred.csv', index=False)