About dataset: Melbourne Housing Market
import pandas as pd # import modules pandas as pd
# Load the Melbourne housing CSV into a DataFrame.
housing_df = pd.read_csv('./dataset/melb_data.csv')
housing_df.head()  # first 5 rows
housing_df.shape  # (number of rows, number of columns)
housing_df.columns  # column labels
housing_df.dtypes  # dtype of every column
housing_df.info()  # prints a per-column summary of the frame
housing_df.isna().values.any()  # True if at least one value anywhere is missing
housing_df[housing_df.duplicated(keep=False)]  # every row that has a duplicate, all occurrences included
housing_df.duplicated().value_counts()  # how many rows are / are not repeats of an earlier row
housing_df.describe()  # summary statistics
# Pairwise correlation between columns. Since pandas 2.0, DataFrame.corr() no
# longer drops non-numeric columns silently and raises when a column cannot be
# converted to float, so restrict the frame to numeric columns explicitly
# (the CSV presumably contains text columns such as the suburb name - verify).
housing_df.select_dtypes(include='number').corr()
import matplotlib.pyplot as plt # import maplotlib as plt for data visualization
import seaborn as sns # import seaborn as sns for data visualization
%matplotlib inline
# Render matplotlib figures inline, directly in the notebook output.
# Heatmap of the pairwise correlations. Only numeric columns are used:
# since pandas 2.0, DataFrame.corr() raises instead of silently skipping
# columns that cannot be converted to float.
sns.heatmap(data=housing_df.select_dtypes(include='number').corr())
housing_df['Rooms'].value_counts()  # number of rows for each distinct 'Rooms' value
sns.countplot(data=housing_df, x='Rooms')  # the same counts as a bar plot
plt.tight_layout()
# The same counts as a pie chart: slices are labelled with percentages and
# the 'Rooms' values are moved to a legend instead of being drawn on the slices.
housing_df['Rooms'].value_counts().plot.pie(autopct='%1.1f%%', labels=None, legend=True)
plt.tight_layout()
# 'Bedroom2' against 'Car', with points coloured by 'Rooms'.
sns.scatterplot(x='Bedroom2', y='Car', data=housing_df, hue='Rooms')
plt.tight_layout()
# Largest 'Landsize': .loc takes the row label first, then the column label.
housing_df.describe().loc['max', 'Landsize']
# Discard every row that has a missing value in any column.
housing_df = housing_df.dropna(axis=0, how='any')
housing_df.shape
# Prediction target: the 'Price' column.
y = housing_df.loc[:, 'Price']
y
y.describe()
# Predictor columns. The spellings presumably match the CSV header, so do
# not "correct" 'Lattitude' / 'Longtitude' without checking the file.
features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
X = housing_df.loc[:, features]
X
X.describe()  # summary statistics of the predictors
from sklearn.tree import DecisionTreeRegressor # import Decision Tree Regressor as Regressor
# Fit a decision tree on every available row; the fixed seed keeps the
# fitted tree reproducible between runs.
housing_model = DecisionTreeRegressor(random_state=1)
housing_model.fit(X=X, y=y)
housing_model.predict(X.iloc[:5])  # predictions for the first 5 rows
y.iloc[:5]  # actual prices for the same 5 rows, for comparison
from sklearn.metrics import mean_absolute_error # Evaluation Metric with MAE
# In-sample error: the model is scored on the same rows it was fitted on.
y_hat = housing_model.predict(X)
mean_absolute_error(y_true=y, y_pred=y_hat)
from sklearn.model_selection import train_test_split # as splitter dataset into training and testing set
# Hold part of the data out for evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
# Refit from scratch on the training portion only.
housing_model = DecisionTreeRegressor(random_state=1)
housing_model.fit(X=X_train, y=y_train)
# Score on rows the model has not seen during fitting.
y_hat = housing_model.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=y_hat)
def get_mae(max_leaf_nodes, X_train, X_test, y_train, y_test):
    """Return the test-set MAE of a decision tree limited to *max_leaf_nodes* leaves.

    A fresh ``DecisionTreeRegressor`` (``random_state=0``) is fitted on
    ``X_train`` / ``y_train`` and its predictions for ``X_test`` are
    compared with ``y_test`` using mean absolute error.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(X_train, y_train)
    return mean_absolute_error(y_test, tree.predict(X_test))
# Compare the hold-out error of trees with increasingly many leaves.
for max_leaf_nodes in (5, 50, 500, 5000):
    leaf_mae = get_mae(
        max_leaf_nodes=max_leaf_nodes,
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
    )
    print(f'Max leaf nodes: {max_leaf_nodes} \t Mean Absolute Error: {int(leaf_mae)}')
from sklearn.ensemble import RandomForestRegressor # import Random Forest Regressor as Regressor
# Random forest made of 100 trees, seeded for reproducibility.
rf_model = RandomForestRegressor(random_state=1, n_estimators=100)
rf_model.fit(X=X_train, y=y_train)  # fit on the training portion
y_hat = rf_model.predict(X_test)  # predictions for the held-out rows
# Hold-out MAE, truncated to an integer for display.
print(f'Mean Absolute Error: {int(mean_absolute_error(y_test, y_hat))}')