import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Load the breast-cancer dataset; every column except 'diagnosis' is a feature.
df = pd.read_csv('breast_cancer.csv')
X = df.drop(columns=['diagnosis']).to_numpy()
# Label encoding: 'M' (malignant) -> 0, everything else (i.e. 'B', benign) -> 1.
y = (df['diagnosis'] != 'M').astype('int64').to_numpy()
# Summary statistics, shown as the cell output.
df.describe()
radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | ... | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | ... | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 |
mean | 14.127292 | 19.289649 | 91.969033 | 654.889104 | 0.096360 | 0.104341 | 0.088799 | 0.048919 | 0.181162 | 0.062798 | ... | 16.269190 | 25.677223 | 107.261213 | 880.583128 | 0.132369 | 0.254265 | 0.272188 | 0.114606 | 0.290076 | 0.083946 |
std | 3.524049 | 4.301036 | 24.298981 | 351.914129 | 0.014064 | 0.052813 | 0.079720 | 0.038803 | 0.027414 | 0.007060 | ... | 4.833242 | 6.146258 | 33.602542 | 569.356993 | 0.022832 | 0.157336 | 0.208624 | 0.065732 | 0.061867 | 0.018061 |
min | 6.981000 | 9.710000 | 43.790000 | 143.500000 | 0.052630 | 0.019380 | 0.000000 | 0.000000 | 0.106000 | 0.049960 | ... | 7.930000 | 12.020000 | 50.410000 | 185.200000 | 0.071170 | 0.027290 | 0.000000 | 0.000000 | 0.156500 | 0.055040 |
25% | 11.700000 | 16.170000 | 75.170000 | 420.300000 | 0.086370 | 0.064920 | 0.029560 | 0.020310 | 0.161900 | 0.057700 | ... | 13.010000 | 21.080000 | 84.110000 | 515.300000 | 0.116600 | 0.147200 | 0.114500 | 0.064930 | 0.250400 | 0.071460 |
50% | 13.370000 | 18.840000 | 86.240000 | 551.100000 | 0.095870 | 0.092630 | 0.061540 | 0.033500 | 0.179200 | 0.061540 | ... | 14.970000 | 25.410000 | 97.660000 | 686.500000 | 0.131300 | 0.211900 | 0.226700 | 0.099930 | 0.282200 | 0.080040 |
75% | 15.780000 | 21.800000 | 104.100000 | 782.700000 | 0.105300 | 0.130400 | 0.130700 | 0.074000 | 0.195700 | 0.066120 | ... | 18.790000 | 29.720000 | 125.400000 | 1084.000000 | 0.146000 | 0.339100 | 0.382900 | 0.161400 | 0.317900 | 0.092080 |
max | 28.110000 | 39.280000 | 188.500000 | 2501.000000 | 0.163400 | 0.345400 | 0.426800 | 0.201200 | 0.304000 | 0.097440 | ... | 36.040000 | 49.540000 | 251.200000 | 4254.000000 | 0.222600 | 1.058000 | 1.252000 | 0.291000 | 0.663800 | 0.207500 |
8 rows × 30 columns
# Class balance: number of samples per diagnosis label.
sns.countplot(data=df, x='diagnosis', palette='RdBu_r')
<AxesSubplot:xlabel='diagnosis', ylabel='count'>
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# scikit-learn baseline tree. The estimator is seeded as well: scikit-learn
# permutes the candidate features at every split, so ties between equally
# good splits can otherwise be broken differently on each run and the
# reported score would not be reproducible even though the split is fixed.
model = DecisionTreeClassifier(max_depth=10, random_state=1)
model.fit(X_train, y_train)
DecisionTreeClassifier(max_depth=10)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the scikit-learn tree on the held-out split and print, one after
# the other: the confusion matrix, the per-class report, and the accuracy.
predictions = model.predict(X_test)
print(
    confusion_matrix(y_test, predictions),
    classification_report(y_test, predictions),
    accuracy_score(y_test, predictions),
    sep='\n',
)
[[38 4] [ 2 70]] precision recall f1-score support 0 0.95 0.90 0.93 42 1 0.95 0.97 0.96 72 accuracy 0.95 114 macro avg 0.95 0.94 0.94 114 weighted avg 0.95 0.95 0.95 114 0.9473684210526315
# Decision tree from the local DT_orig module, fitted on the same training
# split and with the same depth limit as the scikit-learn tree.
from DT_orig import DecisionTree
# Rebinds `model`; the scikit-learn estimator is no longer reachable by that name.
model = DecisionTree(max_depth=10)
model.fit(X_train, y_train)
Done fitting
# NOTE: this import rebinds `accuracy_score`, shadowing the scikit-learn
# function of the same name imported earlier in the notebook.
from DT_orig import accuracy_score
# Predict on the held-out split and report the accuracy of the DT_orig tree.
predictions = model.predict(X_test)
print(accuracy_score(y_test, predictions))
0.9385964912280702
# Rebuild the arrays, this time keeping the labels as the raw diagnosis
# strings from the CSV instead of the 0/1 encoding used earlier.
X = df.drop(columns=['diagnosis']).to_numpy()
# y = df['diagnosis'].apply(lambda x: 0 if x == 'M' else 1).to_numpy()
y = df.loc[:, 'diagnosis'].to_numpy()
# Show the first ten labels.
y[:10]
array(['M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M'], dtype=object)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# model.fit(X_train, y_train)
# print(accuracy_score(y_test, predictions))
# Student implementation of the decision tree (local module DT_Humaira_Qadeer).
from DT_Humaira_Qadeer import DecisionTreeModel
# replace the above with your version
# Same depth limit as the other trees in this notebook; rebinds `model`.
model = DecisionTreeModel(max_depth=10)
# Keep the features as a DataFrame and the labels as a Series this time
# (no conversion to NumPy), so the custom model is exercised with pandas input.
X = df.drop(columns=['diagnosis'])
# Label encoding: 'B' -> 0, everything else (i.e. 'M') -> 1.
# NOTE: this is the opposite of the encoding used when the data was first
# loaded ('M' -> 0), so class 0/1 rows are swapped relative to those results.
y = (df['diagnosis'] != 'B').astype('int64')
# make sure your model will work with y being a categorical variable as well
# y = df['diagnosis']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)
# Preview the first training rows.
X_train.head()
radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | ... | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
408 | 17.99 | 20.66 | 117.80 | 991.7 | 0.10360 | 0.13040 | 0.120100 | 0.088240 | 0.1992 | 0.06069 | ... | 21.080 | 25.41 | 138.10 | 1349.0 | 0.14820 | 0.37350 | 0.33010 | 0.19740 | 0.3060 | 0.08503 |
4 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.198000 | 0.104300 | 0.1809 | 0.05883 | ... | 22.540 | 16.67 | 152.20 | 1575.0 | 0.13740 | 0.20500 | 0.40000 | 0.16250 | 0.2364 | 0.07678 |
307 | 9.00 | 14.40 | 56.36 | 246.3 | 0.07005 | 0.03116 | 0.003681 | 0.003472 | 0.1788 | 0.06833 | ... | 9.699 | 20.07 | 60.90 | 285.5 | 0.09861 | 0.05232 | 0.01472 | 0.01389 | 0.2991 | 0.07804 |
386 | 12.21 | 14.09 | 78.78 | 462.0 | 0.08108 | 0.07823 | 0.068390 | 0.025340 | 0.1646 | 0.06154 | ... | 13.130 | 19.29 | 87.65 | 529.9 | 0.10260 | 0.24310 | 0.30760 | 0.09140 | 0.2677 | 0.08824 |
404 | 12.34 | 14.95 | 78.29 | 469.1 | 0.08682 | 0.04571 | 0.021090 | 0.020540 | 0.1571 | 0.05708 | ... | 13.180 | 16.85 | 84.11 | 533.1 | 0.10480 | 0.06744 | 0.04921 | 0.04793 | 0.2298 | 0.05974 |
5 rows × 30 columns
# Preview the first training labels (integer-encoded at this point).
y_train.head()
408 1 4 1 307 0 386 0 404 0 Name: diagnosis, dtype: int64
# Confirm the features are being passed as a pandas DataFrame, not a NumPy array.
type(X_train)
pandas.core.frame.DataFrame
# Fit the custom decision tree on the pandas training split.
model.fit(X_train, y_train)
Done fitting
# NOTE: these imports rebind `classification_report`, `confusion_matrix` and
# `accuracy_score` to the custom implementations, shadowing any same-named
# functions imported earlier in the notebook.
from DT_Humaira_Qadeer import classification_report,confusion_matrix,accuracy_score
# Evaluate the custom decision tree on the held-out split.
predictions = model.predict(X_test)
print(confusion_matrix(y_test,predictions))
# NOTE(review): judging by the captured output, the custom classification_report
# seems to print its own "Precision/Recall/F1-Score" line and return an array,
# which is then printed here as well -- confirm against the module.
print(classification_report(y_test,predictions))
print(accuracy_score(y_test, predictions))
[[36 4] [ 6 68]] Precision 0.9 Recall 0.34615384615384615, F1-Score 0.5 [0.9 0.34615385 0.5 ] 0.9122807017543859
from sklearn.ensemble import RandomForestClassifier
# Re-import the scikit-learn metrics: earlier cells rebound these names to the
# custom implementations, and this cell must score with the library versions.
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

# scikit-learn baseline forest. The estimator is seeded to match the seeded
# train/test split; bootstrap sampling and per-split feature sub-sampling are
# random, so without a fixed random_state the reported scores change on
# every run and cannot be compared with the custom forest reliably.
rfc = RandomForestClassifier(n_estimators=100, random_state=1)
rfc.fit(X_train, y_train)

# Score the forest on the held-out split.
rfc_pred = rfc.predict(X_test)
print(classification_report(y_test, rfc_pred))
print(accuracy_score(y_test, rfc_pred))
precision recall f1-score support 0 0.93 0.99 0.96 72 1 0.97 0.88 0.93 42 accuracy 0.95 114 macro avg 0.95 0.93 0.94 114 weighted avg 0.95 0.95 0.95 114 0.9473684210526315
# Type your code here
# Student implementation of the random forest (local module DT_Humaira_Qadeer).
from DT_Humaira_Qadeer import RandomForestModel
# NOTE: rebinds the metric names to the custom implementations again,
# shadowing the scikit-learn versions imported for the baseline forest.
from DT_Humaira_Qadeer import classification_report,confusion_matrix,accuracy_score
# Same number of trees as the scikit-learn baseline forest.
rfc = RandomForestModel(n_estimators=100)
rfc.fit(X_train, y_train)
# Score the custom forest on the held-out split.
rfc_pred = rfc.predict(X_test)
print(classification_report(y_test, rfc_pred))
print(accuracy_score(y_test, rfc_pred))
Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Done fitting Precision 0.9743589743589743 Recall 0.3486238532110092, F1-Score 0.5135135135135136 [0.97435897 0.34862385 0.51351351] 0.956140350877193
# Type your code here