# Khai báo thư viện
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Global plotting defaults: seaborn "whitegrid" theme and a 12x6 default figure size
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Load the Pima Indians Diabetes CSV straight from GitHub into a DataFrame
url = "https://raw.githubusercontent.com/npradaschnor/Pima-Indians-Diabetes-Dataset/master/diabetes.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows.
# NOTE(review): `display` is not imported in this file; presumably the helper
# injected by the IPython/Jupyter runtime -- confirm before running as a script.
print("🔹 5 dòng đầu của dữ liệu:")
display(df.head(5))

# Column names, non-null counts and dtypes
print("\n🔹 Thông tin tổng quan dữ liệu:")
df.info()

🔹 5 dòng đầu của dữ liệu:

🔹 Thông tin tổng quan dữ liệu:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB

# Pairwise correlation between every column of the dataset (label included)
correlation = df.corr()

# Draw the matrix as an annotated heatmap; each cell shows the coefficient
# rounded to two decimals.
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
).set_title("Ma trận tương quan giữa các biến", fontsize=14)
plt.show()

# Count the samples of each class, ordered by class label rather than by frequency.
# value_counts() sorts by count (most frequent first), so a hard-coded label list
# would be attached to the wrong wedges whenever class 1 outnumbers class 0.
# Sorting by index and looking the caption up per class keeps them paired.
outcome_labels = {0: 'Không bệnh (0)', 1: 'Có bệnh (1)'}
outcome_counts = df['Outcome'].value_counts().sort_index()

plt.figure(figsize=(6, 6))
plt.pie(
    outcome_counts,
    # Fall back to the raw class value if an unexpected label ever appears.
    labels=[outcome_labels.get(label, str(label)) for label in outcome_counts.index],
    autopct='%1.1f%%',
    startangle=90
)
plt.title("Tỉ lệ phân bố bệnh nhân")
plt.show()

# 1. Separate the target label (y) from the feature columns (X)
y = df["Outcome"]
X = df.drop(columns=["Outcome"])

# 2. Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train the decision tree on the training split.
# Depth is capped at 3 so the tree stays simple and easy to read when plotted.
dt_model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=3,
    random_state=42,
).fit(X_train, y_train)

print("Đã huấn luyện xong mô hình Decision Tree!")

Đã huấn luyện xong mô hình Decision Tree!

# Predict labels for the held-out test set
y_pred_dt = dt_model.predict(X_test)

# Print the human-readable report, then build the same report as a dict
# (rp_dt) so the overall accuracy can be read from it.
print(classification_report(y_test, y_pred_dt))
rp_dt = classification_report(y_test, y_pred_dt, output_dict=True)
print(f"Độ chính xác của Decision Tree: {rp_dt['accuracy']*100:.2f}%")

              precision    recall  f1-score   support

           0       0.80      0.84      0.82        99
           1       0.68      0.62      0.65        55

    accuracy                           0.76       154
   macro avg       0.74      0.73      0.73       154
weighted avg       0.76      0.76      0.76       154

Độ chính xác của Decision Tree: 75.97%

# Plot the decision tree's confusion matrix as an annotated heatmap
# (x axis: predicted label, y axis: actual label).
cm_dt = confusion_matrix(y_test, y_pred_dt)

plt.figure(figsize=(6,5))
sns.heatmap(cm_dt, annot=True, fmt='d', cmap='Blues').set(
    xlabel='Dự đoán',
    ylabel='Thực tế',
    title='Confusion Matrix - Decision Tree',
)
plt.show()

# Blank line in the cell output after the figure
print()

# Draw the fitted decision tree.
# feature_names is passed as a plain list instead of the pandas Index returned by
# X.columns: some scikit-learn releases validate this parameter as a list and
# reject an Index, while a list is accepted by every version.
plt.figure(figsize=(20,10))
plot_tree(
    dt_model,
    filled=True,
    feature_names=list(X.columns),
    class_names=['No', 'Yes'],
    rounded=True,
)
plt.title("Mô hình Cây Quyết Định (Decision Tree Visualization)")
plt.show()

# Train a 100-tree Random Forest on the same training split
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score it on the held-out test set
y_pred_rf = rf_model.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)

# Decision Tree accuracy comes from the report dict computed during its evaluation
acc_dt = rp_dt['accuracy']

print(f"Độ chính xác của Decision Tree: {acc_dt*100:.2f}%")
print(f"Độ chính xác của Random Forest: {acc_rf*100:.2f}%")

# Random Forest is declared better only when strictly more accurate;
# every other case (tie or lower accuracy) gets the generic message.
print(
    "=> Kết luận: Random Forest cho kết quả tốt hơn!"
    if acc_rf > acc_dt
    else "=> Kết luận: Hai mô hình tương đương hoặc cần tinh chỉnh thêm."
)

Độ chính xác của Decision Tree: 75.97%
Độ chính xác của Random Forest: 72.08%
=> Kết luận: Hai mô hình tương đương hoặc cần tinh chỉnh thêm.

# Mount Google Drive at /content/drive (only available inside Google Colab)
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

%%shell
# Convert the report notebook stored under the mounted Drive folder to HTML
jupyter nbconvert '/content/drive/MyDrive/Colab Notebooks/bao_cao_bai_tap_lon_python.ipynb' --to html

[NbConvertApp] Converting notebook /content/drive/MyDrive/Colab Notebooks/bao_cao_bai_tap_lon_python.ipynb to html
[NbConvertApp] WARNING | Alternative text is missing on 4 image(s).
[NbConvertApp] Writing 749219 bytes to /content/drive/MyDrive/Colab Notebooks/bao_cao_bai_tap_lon_python.html

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
0	6	148	72	35	0	33.6	0.627	50	1
1	1	85	66	29	0	26.6	0.351	31	0
2	8	183	64	0	0	23.3	0.672	32	1
3	1	89	66	23	94	28.1	0.167	21	0
4	0	137	40	35	168	43.1	2.288	33	1

Thuộc tính	Ý nghĩa
Pregnancies	Số lần mang thai
Glucose	Nồng độ glucose trong máu
BloodPressure	Huyết áp tâm trương
SkinThickness	Độ dày nếp gấp da
Insulin	Nồng độ insulin
BMI	Chỉ số khối cơ thể
DiabetesPedigreeFunction	Chỉ số di truyền bệnh tiểu đường
Age	Tuổi
Outcome	Nhãn kết quả (0: Không bệnh, 1: Có bệnh)

Ghi chép kỹ thuật

Decision_Tree_Ud_Tieu_Duong (Notebook)

CHƯƠNG 1: TỔNG QUAN VỀ ĐỀ TÀI VÀ KHAI BÁO THƯ VIỆN

1.1. Giới thiệu Machine Learning và bài toán nghiên cứu

1.2. Ngôn ngữ lập trình và các thư viện sử dụng

1.3. Khai báo thư viện

CHƯƠNG 2: DỮ LIỆU VÀ TIỀN XỬ LÝ

2.1. Tải và khám phá dữ liệu

2.2. Mô tả bộ dữ liệu

2.3. Phân tích tương quan giữa các đặc trưng

2.4. Phân bố nhãn (Outcome)

2.5. Kết luận phân tích dữ liệu

2.6. Phân chia dữ liệu huấn luyện và kiểm tra

CHƯƠNG 3: XÂY DỰNG MÔ HÌNH DECISION TREE

3.1. Cơ sở lý thuyết của Decision Tree

3.2. Cấu hình và huấn luyện mô hình

CHƯƠNG 4: TRỰC QUAN VÀ ĐÁNH GIÁ MÔ HÌNH

4.1. Đánh giá hiệu quả mô hình

4.2. Trực quan hóa kết quả

4.3. Đề xuất cải tiến và So sánh mô hình

4.3.1. Phân tích kết quả thực nghiệm

4.3.2. Đánh giá và Nhận xét

CHƯƠNG 5: KẾT LUẬN TỔNG KẾT VÀ HƯỚNG PHÁT TRIỂN

5.1. Kết quả đạt được

5.2. Đánh giá ưu và nhược điểm

5.3. Bài học rút ra và Hướng phát triển