
# About dataset: https://en.wikipedia.org/wiki/Iris_flower_data_set
import pandas as pd  # pandas for tabular data handling

# Load the Iris dataset from CSV into a DataFrame.
iris_df = pd.read_csv('./dataset/iris/Iris.csv')
iris_df.head()  # show the first 5 rows of data

# Drop the synthetic 'Id' column in place (no second DataFrame is created).
iris_df.drop(columns='Id', inplace=True)
iris_df.head()  # confirm the column is gone

iris_df.shape    # dataset dimensions: (rows, columns)
iris_df.columns  # list of column labels
iris_df.dtypes   # datatype of every column
iris_df.info()   # index, dtypes, non-null counts, memory usage

iris_df.isna().values.any()              # True if any NULL value exists in the dataset
iris_df[iris_df.duplicated(keep=False)]  # show all rows involved in duplication
iris_df.duplicated().value_counts()      # count duplicated vs. unique rows
iris_df.drop_duplicates(inplace=True)    # delete duplicate rows in place
iris_df.shape                            # dimensions after de-duplication
iris_df.describe()                       # summary statistics of numeric columns

# Correlation between columns. numeric_only=True is required because the
# string-valued 'Species' column cannot be correlated; pandas >= 2.0 raises
# a TypeError if non-numeric columns are passed to corr().
iris_df.corr(numeric_only=True)
import matplotlib.pyplot as plt # import maplotlib as plt for data visualization
import seaborn as sns # import seaborn as sns for data visualization
%matplotlib inline
# output from data visualization data will be directed to notebook
# Heatmap of pairwise feature correlations. numeric_only=True excludes the
# string 'Species' column (pandas >= 2.0 raises a TypeError otherwise).
sns.heatmap(data=iris_df.corr(numeric_only=True))

iris_df['Species'].value_counts()  # sample count of every species (Iris)

# Class counts as a bar plot (pandas plotting).
iris_df['Species'].value_counts().plot.bar()
plt.tight_layout()  # fit labels within the current figure
plt.show()

# Same class counts with seaborn's countplot (one color per species).
sns.countplot(data=iris_df, x='Species')
plt.tight_layout()

# Class balance as a pie chart with percentage labels.
iris_df['Species'].value_counts().plot.pie(autopct='%1.1f%%', labels=None, legend=True)
plt.tight_layout()
# One line plot per feature, laid out on a 2x2 grid of axes.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(8, 8))
feature_titles = [
    ('SepalLengthCm', 'Sepal Length'),
    ('SepalWidthCm', 'Sepal Width'),
    ('PetalLengthCm', 'Petal Length'),
    ('PetalWidthCm', 'Petal Width'),
]
# axes.flat iterates row-major: [0][0], [0][1], [1][0], [1][1] — same order
# as the original cell-by-cell plotting.
for axis, (column, title) in zip(ax.flat, feature_titles):
    iris_df[column].plot.line(ax=axis)
    axis.set_title(title)

# All four features overlaid on a single set of axes.
iris_df.plot()
plt.tight_layout()
# Per-feature histograms; bins=10 gives a finer view of each distribution.
iris_df.hist(bins=10, figsize=(6, 6))
plt.tight_layout()

# Box plots per feature: quartiles 1-3, min, max, and outliers.
iris_df.boxplot()
plt.tight_layout()

# The same box plots, grouped by species.
iris_df.boxplot(figsize=(8, 8), by="Species")
plt.tight_layout()

# Scatter plot of sepal length vs. sepal width, colored by species.
sns.scatterplot(data=iris_df, x='SepalLengthCm', y='SepalWidthCm', hue='Species')
plt.tight_layout()

# Pairwise scatter matrix over all features, colored by species.
sns.pairplot(iris_df, hue='Species', markers='+')
plt.tight_layout()

# Violin plot of the sepal-length distribution per species with quartile lines.
sns.violinplot(data=iris_df, x='SepalLengthCm', y='Species', inner='quartile')
plt.tight_layout()
from sklearn.model_selection import train_test_split # splits the dataset into training and testing sets
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report # evaluate model performance

# Features: every column except the class label.
X = iris_df.drop(columns='Species')
X.head()  # first 5 rows of X

# Target: the class label column.
y = iris_df.Species
y.head()  # first 5 rows of y

# Hold out 40% of the rows for testing; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=10)

# Report the dimensions of each split.
print('training dataset')
print(X_train.shape)
print(y_train.shape)
print()
print('testing dataset:')
print(X_test.shape)
print(y_test.shape)
from sklearn.neighbors import KNeighborsClassifier # using KNN as classifier

# Sweep the neighbor count k over 1..25 and record test-set accuracy for each.
k_range = list(range(1, 26))  # range() excludes the stop value, so k runs 1-25
scores = []
# BUG FIX: the loop body below had lost its indentation (notebook export),
# which is a SyntaxError in plain Python — restored here.
for k in k_range:
    model_knn = KNeighborsClassifier(n_neighbors=k)  # configure the algorithm
    model_knn.fit(X_train, y_train)                  # train the model/classifier
    y_pred = model_knn.predict(X_test)               # predict on held-out data
    scores.append(accuracy_score(y_test, y_pred))    # record performance

# Accuracy as a function of k: (x-axis) neighbor count, (y-axis) accuracy.
plt.plot(k_range, scores)
plt.xlabel('Value of k for KNN')
plt.ylabel('Accuracy Score')
plt.title('Accuracy Scores for Values of k of k-Nearest-Neighbors')
plt.tight_layout()
plt.show()
# Refit KNN at the chosen setting of 3 neighbors.
model_knn = KNeighborsClassifier(n_neighbors=3)
model_knn.fit(X_train, y_train)
y_pred = model_knn.predict(X_test)

# Print accuracy, the confusion matrix, and the per-class report in turn.
for result in (accuracy_score(y_test, y_pred),
               confusion_matrix(y_test, y_pred),
               classification_report(y_test, y_pred)):
    print(result)
from sklearn.linear_model import LogisticRegression # import Logistic Regression as classifier

# Logistic regression; solver and multi_class are pinned explicitly.
model_logreg = LogisticRegression(solver='lbfgs', multi_class='auto')
model_logreg.fit(X_train, y_train)
y_pred = model_logreg.predict(X_test)

# Print accuracy, the confusion matrix, and the per-class report in turn.
for result in (accuracy_score(y_test, y_pred),
               confusion_matrix(y_test, y_pred),
               classification_report(y_test, y_pred)):
    print(result)
from sklearn.svm import SVC # import SVC as classifier

# Support-vector classifier with the 'scale' gamma heuristic.
model_svc = SVC(gamma='scale')
model_svc.fit(X_train, y_train)
y_pred = model_svc.predict(X_test)

# Print accuracy, the confusion matrix, and the per-class report in turn.
for result in (accuracy_score(y_test, y_pred),
               confusion_matrix(y_test, y_pred),
               classification_report(y_test, y_pred)):
    print(result)
from sklearn.tree import DecisionTreeClassifier # import Decision Tree Classifier as classifier

# Single decision tree with default hyperparameters.
model_dt = DecisionTreeClassifier()
model_dt.fit(X_train, y_train)
y_pred = model_dt.predict(X_test)

# Print accuracy, the confusion matrix, and the per-class report in turn.
for result in (accuracy_score(y_test, y_pred),
               confusion_matrix(y_test, y_pred),
               classification_report(y_test, y_pred)):
    print(result)
from sklearn.ensemble import RandomForestClassifier # import Random Forest Classifier as classifier

# Random forest ensemble of 100 trees.
model_rf = RandomForestClassifier(n_estimators=100)
model_rf.fit(X_train, y_train)
pred_rf = model_rf.predict(X_test)

# BUG FIX: the metrics below previously evaluated `y_pred`, which still held
# the decision tree's predictions from the earlier cell — the random forest
# was never actually scored. Evaluate `pred_rf` instead.
print(accuracy_score(y_test, pred_rf))      # accuracy of the random forest
print(confusion_matrix(y_test, pred_rf))    # confusion matrix
print(classification_report(y_test, pred_rf))  # per-class precision/recall/F1
# Compare the five fitted models on the same held-out test set.
models = [model_knn, model_logreg, model_svc, model_dt, model_rf]
accuracy_scores = []
# BUG FIX: the loop body below had lost its indentation (notebook export),
# which is a SyntaxError in plain Python — restored here.
for model in models:
    y_pred = model.predict(X_test)               # make predictions
    accuracy = accuracy_score(y_test, y_pred)    # accuracy score
    accuracy_scores.append(accuracy)
print(accuracy_scores)

# Bar chart of the five accuracies; the y-axis is zoomed to 0.90-1.01 so the
# small differences between the models are visible.
plt.bar(['KNN', 'LogReg', 'SVC', 'DT', 'RF'], accuracy_scores)
plt.ylim(0.90, 1.01)
plt.title('Accuracy comparison For Various Models', fontsize=15, color='r')
plt.xlabel('Models', fontsize=18, color='g')
plt.ylabel('Accuracy Score', fontsize=18, color='g')
plt.tight_layout()
plt.show()