Диссертация: Анализ и прогнозирование цен на рынке недвижимости Москвы

Внимание! Если размещение файла нарушает Ваши авторские права, то обязательно сообщите нам

Приложение 2 — код на языке программирования Python

# --- Environment setup and data loading --------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Default seaborn palette, kept for reuse by the plots below.
color = sns.color_palette()

# BUG FIX: "%matplotlib inline" is IPython magic, not Python syntax — it
# raises a SyntaxError when this appendix is executed as a plain script.
# Kept as a comment; re-enable it only inside a Jupyter notebook cell.
# %matplotlib inline

# Suppress pandas' SettingWithCopyWarning (default='warn').
pd.options.mode.chained_assignment = None
# Show up to 500 columns when displaying wide frames.
pd.set_option('display.max_columns', 500)

# Moscow housing transactions training set; 'timestamp' parsed as datetime.
train_data = pd.read_csv("../input/train.csv", parse_dates=['timestamp'])

# Target transformation: work with log(1 + home_cost) to tame right skew.
train_data['home_cost_log'] = np.log1p(train_data['home_cost'])

# Share (in %) of missing values for every feature that has at least one gap,
# most-missing first. isnull().mean() == isnull().sum() / len(frame).
train_na = train_data.isnull().mean() * 100
train_na = train_na[train_na > 0].sort_values(ascending=False)

# Bar chart of missingness, one bar per affected feature.
f, ax = plt.subplots(figsize=(12, 8))
plt.xticks(rotation=90)
sns.barplot(x=train_na.index, y=train_na)
ax.set(title='Percent missing data by feature', ylabel='% missing')

# Repair two known data-entry errors, then inspect feature correlations.

# A 'state' code of 33 is an outlier entry; replace it with the modal state.
state_mode = train_data['state'].mode().iloc[0]
train_data.loc[train_data['state'] == 33, 'state'] = state_mode

# A 'build_year' of 20052009 is two years glued together; use their midpoint.
train_data.loc[train_data['build_year'] == 20052009, 'build_year'] = 2007

# Correlation heatmap over dwelling-level characteristics and the target.
internal_chars = ['allsq', 'lifesq', 'floor', 'max_floor', 'build_year',
                  'num_room', 'kitch_sq', 'state', 'home_cost']
corrmat = train_data[internal_chars].corr()

f, ax = plt.subplots(figsize=(10, 7))
plt.xticks(rotation=90)
sns.heatmap(corrmat, square=True, linewidths=.5, annot=True)

import datetime
import matplotlib.dates as mdates

# Tick the x-axis once per year, labelled with the 4-digit year.
years = mdates.YearLocator()  # every year
yearsFmt = mdates.DateFormatter('%Y')

# BUG FIX: Series has no .value_cnts() method — the pandas API is
# .value_counts(). Number of transactions recorded on each date.
ts_vc = train_data['timestamp'].value_counts()

f, ax = plt.subplots(figsize=(12, 6))
# BUG FIX: plt.bar's first argument was renamed from 'left' to 'x' in
# matplotlib 2.0 and 'left' was removed in 3.0; pass positions positionally.
plt.bar(ts_vc.index, ts_vc)
ax.xaxis.set_major_locator(years)
ax.xaxis.set_major_formatter(yearsFmt)
ax.set(title='Sales volume over time', ylabel='Number of transactions')

# District area in square kilometres (area_m is in m²; 1e6 m² = 1 km²).
train_data['area_dist'] = train_data['area_m'] / 1e6
# Population density: residents per square kilometre of the raion.
train_data['density'] = train_data['raion_popul'] / train_data['area_dist']

# Per-district medians of density and cost, with a linear fit overlaid.
f, ax = plt.subplots(figsize=(10, 6))
sa_cost = train_data.groupby('sub')[['density', 'home_cost']].median()
sns.regplot(x="density", y="home_cost", data=sa_cost, scatter=True,
            truncate=True)
ax.set(title='Median home cost by raion population density (people per sq. km)')

# Fraction of the raion population that holds a job.
train_data['job_share'] = train_data['job_all'] / train_data['raion_popul']

# Per-district means, fitted with a 4th-order polynomial trend.
f, ax = plt.subplots(figsize=(12, 6))
sa_cost = train_data.groupby('sub')[['job_share', 'home_cost']].mean()
sns.regplot(x="job_share", y="home_cost", data=sa_cost, scatter=True,
            order=4, truncate=True)
ax.set(title='District mean home cost by share of jobing age population')

# Cost distribution split by the number of top-20 universities in the raion:
# translucent raw points (stripplot) underneath box-and-whisker summaries.
f, ax = plt.subplots(figsize=(12, 8))
sns.stripplot(x="university_top_20_raion", y="home_cost", data=train_data,
              jitter=True, alpha=.2, color=".8")
sns.boxplot(x="university_top_20_raion", y="home_cost", data=train_data)
ax.set(title='Distribution of home cost by # of top universities in Raion',
       xlabel='university_top_20_raion', ylabel='home_cost')

# Home cost versus distance from the Kremlin, with a linear trend line.
f, ax = plt.subplots(figsize=(10, 6))
sns.regplot(x="kremlin_dist", y="home_cost", data=train_data, scatter=True,
            truncate=True, scatter_kws={'color': 'r', 'alpha': .2})
ax.set(title='Home cost by distance to Kremlin')

# Load the test set and compute its missing-value profile, mirroring train.
test_data = pd.read_csv("../input/test.csv", parse_dates=['timestamp'])

test_na = (test_data.isnull().sum() / len(test_data)) * 100
test_na = test_na.drop(test_na[test_na == 0].index).sort_values(ascending=False)

# Stack train (without the target) on top of test for joint processing.
all_data = pd.concat([train_data.drop('home_cost', axis=1), test_data])

# BUG FIX: the original used chained indexing
# (all_data.iloc[:l]['dataset'] = 'train'), which writes into a temporary
# copy and leaves the 'dataset' column unchanged — silently, because the
# SettingWithCopyWarning is suppressed above. Build the labels directly:
# the first len(train_data) rows of the concat come from train, the rest
# from test.
l = len(train_data)
all_data['dataset'] = ['train'] * l + ['test'] * (len(all_data) - l)

# Boolean mask selecting the training rows of the combined frame.
train_dataset = all_data['dataset'] == 'train'

# Overlay train and test transaction counts per day on one time axis.
years = mdates.YearLocator()
yearsFmt = mdates.DateFormatter('%Y')

# BUG FIX: .value_cnts() is not a pandas method; use .value_counts().
ts_vc_train = train_data['timestamp'].value_counts()
ts_vc_test = test_data['timestamp'].value_counts()

f, ax = plt.subplots(figsize=(12, 6))
# BUG FIX: plt.bar's 'left' keyword was renamed to 'x' (matplotlib >= 2.0,
# removed in 3.0); pass the bar positions positionally for compatibility.
plt.bar(ts_vc_train.index, ts_vc_train)
plt.bar(ts_vc_test.index, ts_vc_test)
ax.xaxis.set_major_locator(years)
ax.xaxis.set_major_formatter(yearsFmt)
# Fixed the garbled title/label text from the original extraction.
ax.set(title='Number of transactions per day', ylabel='count')

from sklearn.metrics import make_scorer

def rmsle_exp(y_true_log, y_pred_log):
    """Root mean squared logarithmic error for log1p-transformed targets.

    Parameters
    ----------
    y_true_log, y_pred_log : array-like
        True and predicted targets on the log1p scale, i.e. log(1 + cost)
        as stored in 'home_cost_log'.

    Returns
    -------
    float
        RMSLE measured on the original (rouble) scale.
    """
    # BUG FIX: the target was transformed with np.log1p, so the inverse is
    # np.expm1, not np.exp. Using np.exp shifts every value up by 1 and the
    # metric then compares log(cost + 2) terms instead of log(cost + 1).
    y_true = np.expm1(y_true_log)
    y_pred = np.expm1(y_pred_log)
    # log1p(x) == log(x + 1), computed accurately for small x.
    return np.sqrt(np.mean(np.power(np.log1p(y_true) - np.log1p(y_pred), 2)))

def score_model(model, pipe):
    """Return (train RMSLE, test RMSLE) for a fitted estimator.

    Relies on the module-level splits X_train/X_test and y_train/y_test,
    pushing each feature matrix through the shared preprocessing pipeline
    before predicting.
    """
    preds_train = model.predict(pipe.transform(X_train))
    preds_test = model.predict(pipe.transform(X_test))
    return rmsle_exp(y_train, preds_train), rmsle_exp(y_test, preds_test)

from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least squares with an intercept term.
lr = LinearRegression(fit_intercept=True)
lr.fit(pipe.transform(X_train), y_train)
train_err, test_err = score_model(lr, pipe)
print("Train error: {:.4f}, Test error: {:.4f}".format(train_err, test_err))

from sklearn.svm import SVR

# Support-vector regression with default (RBF-kernel) settings.
svr = SVR()
svr.fit(pipe.transform(X_train), y_train)
train_err, test_err = score_model(svr, pipe)
print("Train error: {:.4f}, Test error: {:.4f}".format(train_err, test_err))

from sklearn.ensemble import RandomForestRegressor

# Random forest: 100 trees; large leaves (>= 50 samples) curb overfitting.
rfr = RandomForestRegressor(n_estimators=100, min_samples_leaf=50, n_jobs=-1)
rfr.fit(pipe.transform(X_train), y_train)
train_err, test_err = score_model(rfr, pipe)
print("Train error: {:.4f}, Test error: {:.4f}".format(train_err, test_err))

from xgboost import XGBRegressor

# Gradient-boosted trees with default hyperparameters.
xgb = XGBRegressor()
xgb.fit(pipe.transform(X_train), y_train)
train_err, test_err = score_model(xgb, pipe)
print("Train error: {:.4f}, Test error: {:.4f}".format(train_err, test_err))

# Refit the preprocessing pipeline and the model on the full dataset
# before producing final predictions.
pipe.fit(X)
xgb.fit(pipe.transform(X), y)

# NOTE(review): the lines below are the notebook's printed repr of the fitted
# XGBRegressor, pasted into the appendix as if they were code. Executed, they
# would merely build and discard an unused estimator, so they are kept only as
# a record of the hyperparameters actually used by the model above.
# XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
#              learning_rate=0.1, max_delta_step=0, max_depth=3,
#              min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
#              objective='reg:linear', reg_alpha=0, reg_lambda=1,
#              scale_pos_weight=1, seed=0, silent=True, subsample=1)

# Build the test feature matrix with exactly the training columns.
test_data = pd.merge(test, macro, how='left', on='timestamp')
# Encode the timestamp as a plain numeric feature (epoch nanoseconds, scaled).
test_data['timestamp'] = pd.to_numeric(pd.to_datetime(test_data['timestamp'])) / 1e18
test_data = pd.get_dummies(test_data).astype(np.float64)

# Align the test frame to the training columns; any column the test set
# lacks (e.g. a dummy level seen only in train) is filled with NaN.
df_test = pd.DataFrame(columns=df.columns)
for column in df_test.columns:
    df_test[column] = test_data[column] if column in test_data.columns else np.nan

# Assemble the submission: one predicted home cost per test-set id.
predict_data = pd.DataFrame({'id': test['id'], 'home_cost': pred})
# Sanity-check the first rows before writing.
predict_data.head()
predict_data.to_csv('pred.csv', index=False)