This project provides a full pipeline for processing YouTube video transcripts, translating them, and building text classification models.
- Download & extract audio from YouTube videos using yt-dlp.
- Transcribe audio with OpenAI's Whisper model.
- Translate transcripts to a target language using Google Translate.
- Clean and preprocess the textual data.
- Train machine learning classifiers for text categorization:
  - Bag-of-Words + LinearSVC
  - TF-IDF + Multinomial Naive Bayes
- Evaluate models using accuracy, classification reports, and confusion matrices.
- Save models and vectorizers for future inference.
- Backup artifacts to Google Drive for safekeeping.
This notebook is fully Google Colab-compatible and supports storing all outputs (models, plots, CSVs, and README summaries) in a structured folder.
Installs all required Python packages and system dependencies to run the pipeline: yt-dlp, openai-whisper, googletrans==4.0.0-rc1, pandas, numpy, scikit-learn, matplotlib, tqdm, langdetect, torch, torchaudio, and torchvision, plus ffmpeg via apt.
!pip install -q yt-dlp openai-whisper googletrans==4.0.0-rc1 pandas numpy scikit-learn matplotlib tqdm langdetect
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!cp "/content/www.youtube.com_cookies (1).txt" /content/cookies.txt
!apt-get update -qq && apt-get install -y ffmpeg
Install Dependencies & Setup Cookies
!cp "/content/www.youtube.com_cookies.txt" /content/cookies.txtImports necessary libraries and creates folder structure:
import os, sys, random, time, subprocess
from pathlib import Path
from typing import Optional, List
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
try:
import whisper
except Exception:
whisper = None
try:
from googletrans import Translator
except Exception:
Translator = None
try:
from langdetect import detect
except Exception:
detect = None
BASE = Path.cwd()
DATA_DIR = BASE / "data"
AUDIO_DIR = DATA_DIR / "audio"
TRANS_DIR = DATA_DIR / "transcripts"
LOGS_DIR = BASE / "logs"
MODELS_DIR = BASE / "models"
for d in [DATA_DIR, AUDIO_DIR, TRANS_DIR, LOGS_DIR, MODELS_DIR]:
    d.mkdir(parents=True, exist_ok=True)
Defines pipeline parameters for downloading, transcription, translation, and ML analysis. Parameters can be modified to adjust batch size, target language, or Whisper model.
CSV_PATH = "youtube_data.csv" #@param {type:"string"}
YOUTUBE_ID_COL = "video_id" #@param {type:"string"}
CATEGORY_COL = "category" #@param {type:"string"}
N_SAMPLE = 100 #@param {type:"number"}
BATCH_SIZE = 20 #@param {type:"number"}
SHUFFLE_IDS = True #@param {type:"boolean"}
USE_COOKIES_FROM_BROWSER = False #@param {type:"boolean"}
# Browser profile passed to --cookies-from-browser when the flag above is enabled (referenced by build_yt_dlp_cmd).
BROWSER = "chrome" #@param {type:"string"}
COOKIES_TXT = "/content/cookies.txt" #@param {type:"string"}
AUDIO_EXT = "m4a" #@param ["m4a","mp3","wav","opus"]
MIN_SLEEP, MAX_SLEEP = 1, 3 #@param {type:"raw"}
CONCURRENT_FRAGMENTS = 1 #@param {type:"number"}
LIMIT_RATE = "2M" #@param {type:"string"}
FORCE_IPV4 = True #@param {type:"boolean"}
REGION = "" #@param {type:"string"}
USE_ARCHIVE = True #@param {type:"boolean"}
ARCHIVE_PATH = str(DATA_DIR / "downloaded.txt")
WHISPER_MODEL = "large-v3" #@param ["tiny","base","small","medium","large","large-v3"]
WHISPER_LANGUAGE_HINT = "" # leave empty for autodetect
SKIP_DOWNLOAD = False #@param {type:"boolean"}
SKIP_TRANSCRIBE = False #@param {type:"boolean"}
SKIP_TRANSLATE = False #@param {type:"boolean"}
TARGET_LANG = "en" #@param ["en","th","zh-cn","ja","es","de","fr","ru","ar","hi"]
USE_TFIDF = True #@param {type:"boolean"}
MAX_FEATURES = 20000 #@param {type:"number"}
RANDOM_STATE = 42
WRITE_CHECKPOINT_EVERY = 1 #@param {type:"number"}
CHECKPOINT_PREFIX = "translations_accumulated"
Loads the input CSV and extracts video IDs. Handles missing categories, removes duplicates, shuffles IDs if needed, and limits the number of samples to N_SAMPLE.
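If a CSV is not available yet, the expected layout can be illustrated with a tiny hypothetical file (the IDs, labels, and the file name youtube_data_example.csv below are placeholders for illustration only; point CSV_PATH at such a file to dry-run the pipeline):
# Hypothetical illustration of the expected input layout: one YouTube ID per row plus a category label.
import pandas as pd
pd.DataFrame({
    "video_id": ["kudq-eimjtY", "dQw4w9WgXcQ"],  # placeholder IDs
    "category": ["education", "music"],          # placeholder labels
}).to_csv("youtube_data_example.csv", index=False)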
assert Path(CSV_PATH).exists(), f"CSV not found: {CSV_PATH}"
df_meta = pd.read_csv(CSV_PATH)
assert YOUTUBE_ID_COL in df_meta.columns
if not CATEGORY_COL:
for cand in ["category","category_id","categoryId","Category"]:
if cand in df_meta.columns:
CATEGORY_COL = cand; break
if CATEGORY_COL and CATEGORY_COL in df_meta.columns:
print("Using category column:", CATEGORY_COL)
else:
CATEGORY_COL = None
print("WARNING: No category column found; will use 'unknown'.")
df_meta[YOUTUBE_ID_COL] = df_meta[YOUTUBE_ID_COL].astype(str).str.strip()
df_meta = df_meta.dropna(subset=[YOUTUBE_ID_COL]).drop_duplicates(subset=[YOUTUBE_ID_COL]).reset_index(drop=True)
if SHUFFLE_IDS:
df_meta = df_meta.sample(frac=1.0, random_state=RANDOM_STATE).reset_index(drop=True)
sampled = df_meta.iloc[:N_SAMPLE].reset_index(drop=True) if len(df_meta)>N_SAMPLE else df_meta.copy()
all_ids = sampled[YOUTUBE_ID_COL].tolist()
print("Prepared IDs:", len(all_ids))Tests cookies for downloading restricted videos with yt-dlp. Warns if cookies are missing or expired.
# Preflight cookie test
import subprocess, time
TEST_ID = all_ids[0] if all_ids else "kudq-eimjtY" # any video ID is fine
cmd = ["yt-dlp", f"https://www.youtube.com/watch?v={TEST_ID}", "-F"]
# Use cookies if present
if COOKIES_TXT and Path(COOKIES_TXT).exists():
cmd += ["--cookies", COOKIES_TXT]
else:
print("⚠️ cookies.txt not found at", COOKIES_TXT, "- continuing unauthenticated")
# Run with a short timeout
res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, timeout=60)
tail = "\n".join(res.stdout.splitlines()[-40:])
print(tail)
# Simple verdict
if "Available formats" in res.stdout or "format code" in res.stdout:
print("\n✅ Cookies look valid. You can proceed to run batches.")
else:
print("\n⚠️ Cookies may be invalid/expired (or the test video is blocked).")
print(" • Re-export cookies from your logged-in browser and re-upload to:", COOKIES_TXT)
print(" • Then rerun this preflight test before starting a long run.")Defines:
- build_yt_dlp_cmd() → generates yt-dlp command for downloading audio
- download_audio_for_ids() → downloads audio for a list of video IDs, handles retries and fallback formats
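Once the helpers defined in the cell below have been run, they can be smoke-tested on a single ID before committing to a long batch; a minimal sketch reusing TEST_ID from the preflight cell above:
# Sketch: one-ID smoke test of the download helper (run this only after the definitions below).
ok_ids, bad_ids = download_audio_for_ids([TEST_ID], AUDIO_DIR, AUDIO_EXT)
print("downloaded:", ok_ids, "| failed:", [v for v, _ in bad_ids])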
def build_yt_dlp_cmd(video_id: str, out_dir: Path, audio_ext: str) -> list:
url = f"https://www.youtube.com/watch?v={video_id}"
out_tpl = str(out_dir / f"%(id)s.%(ext)s")
cmd = [
"yt-dlp", url, "-f","bestaudio/best","-x","--audio-format",audio_ext,
"--no-continue","--no-overwrites","--ignore-errors","--no-warnings","--no-check-certificates",
"--add-header","User-Agent: Mozilla/5.0","--add-header","Accept-Language: en-US,en;q=0.9",
"--retries","5","--fragment-retries","10","--abort-on-error",
"--min-sleep-interval", str(MIN_SLEEP),"--max-sleep-interval", str(MAX_SLEEP),
"--output", out_tpl,"--geo-bypass",
"--concurrent-fragments", str(CONCURRENT_FRAGMENTS),
"--extractor-args","youtube:player-client=web,webpage"
]
if LIMIT_RATE.strip(): cmd += ["--limit-rate", LIMIT_RATE]
if FORCE_IPV4: cmd += ["--force-ipv4"]
if REGION.strip(): cmd += ["--geo-bypass-country", REGION.strip()]
if USE_ARCHIVE: cmd += ["--download-archive", ARCHIVE_PATH]
if USE_COOKIES_FROM_BROWSER: cmd += ["--cookies-from-browser", BROWSER]
if COOKIES_TXT and Path(COOKIES_TXT).exists(): cmd += ["--cookies", COOKIES_TXT]
return cmd
def download_audio_for_ids(ids: List[str], audio_dir: Path, audio_ext: str):
audio_dir.mkdir(parents=True, exist_ok=True)
downloaded, failed = [], []
for vid in tqdm(ids, desc="Downloading audio"):
if list(audio_dir.glob(f"{vid}.*")): downloaded.append(vid); continue
cmd = build_yt_dlp_cmd(vid, audio_dir, audio_ext)
try:
res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, timeout=900)
if res.returncode != 0:
ok = False
for ea in ["android","tv"]:
fcmd = [c for c in cmd]
if "--extractor-args" in fcmd:
i = fcmd.index("--extractor-args"); fcmd[i+1] = f"youtube:player-client={ea}"
else:
fcmd += ["--extractor-args", f"youtube:player-client={ea}"]
res2 = subprocess.run(fcmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, timeout=900)
if res2.returncode == 0: ok = True; break
if not ok:
f2 = [c for c in cmd if c != "--abort-on-error"] + ["-f","140/bestaudio"]
res3 = subprocess.run(f2, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, timeout=900)
if res3.returncode == 0: ok = True
if ok: downloaded.append(vid)
else: failed.append((vid, res.stdout[-1000:]))
else:
downloaded.append(vid)
except subprocess.TimeoutExpired:
failed.append((vid,"Timeout"))
except Exception as e:
failed.append((vid,str(e)))
time.sleep(random.uniform(1,3))
if failed:
with open(LOGS_DIR / "download_failures.log","a",encoding="utf-8") as f:
for vid, err in failed: f.write(f"{vid}\t{err}\n")
    return downloaded, failed
Defines:
- load_whisper() → loads Whisper model
- transcribe_file() → transcribes a single audio file
- transcribe_all() → transcribes all audio files in data/audio/ and saves as .txt in data/transcripts/
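Whisper's large-v3 checkpoint is heavy, so it is worth checking for a GPU before transcription starts; a small sketch assuming torch from the setup cell:
# Sketch: warn when no GPU is visible, since WHISPER_MODEL="large-v3" is very slow on CPU.
import torch
if not torch.cuda.is_available():
    print("No GPU detected; consider a smaller WHISPER_MODEL (e.g. 'small') for this session.")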
def load_whisper(model_name:str):
assert whisper is not None, "Install openai-whisper first."
return whisper.load_model(model_name)
def transcribe_file(model, audio_path: Path, language_hint: str = "") -> str:
opts = {}
if language_hint.strip(): opts["language"] = language_hint.strip()
result = model.transcribe(str(audio_path), **opts)
return result.get("text","").strip()
def discover_audio_files():
exts = ("*.m4a","*.mp3","*.wav","*.webm","*.opus")
files = []
for pat in exts: files += list(AUDIO_DIR.glob(pat))
return sorted(files)
def transcribe_all(trans_dir: Path, language_hint:str=""):
trans_dir.mkdir(parents=True, exist_ok=True)
files = discover_audio_files()
if not files: return 0
model = load_whisper(WHISPER_MODEL)
done = 0
for a in tqdm(files, desc="Transcribing"):
out_txt = trans_dir / f"{a.stem}.txt"
if out_txt.exists(): continue
try:
out_txt.write_text(transcribe_file(model, a, language_hint), encoding="utf-8"); done += 1
except Exception as e:
with open(LOGS_DIR / "transcribe_failures.log","a",encoding="utf-8") as f:
f.write(f"{a.stem}\t{repr(e)}\n")
    return done
Defines:
- safe_detect() → language detection fallback
- load_translator() → loads Google Translate
- build_rows_from_transcripts() → constructs DataFrame from transcripts
- translate_and_write() → translates text, saves CSVs (translations_accumulated.csv, _2col.csv, _transcripts.csv)
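googletrans==4.0.0-rc1 talks to an unofficial endpoint and can fail intermittently; translate_and_write() below already falls back to the original text when that happens. An optional smoke test (a sketch):
# Sketch: verify the unofficial googletrans client responds before a long translation run.
from googletrans import Translator
try:
    print(Translator().translate("bonjour le monde", dest="en").text)
except Exception as exc:
    print("Translator unavailable; the pipeline will keep the original text:", exc)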
def safe_detect(text:str, default="unknown"):
if not text.strip(): return default
if detect is None: return default
try: return detect(text)
except Exception: return default
def load_translator():
if Translator is None: return None
try: return Translator()
except Exception: return None
def build_rows_from_transcripts(sampled_meta: pd.DataFrame):
txt_map = {p.stem: p for p in TRANS_DIR.glob("*.txt")}
rows = []
for _, row in sampled_meta.iterrows():
vid = str(row[YOUTUBE_ID_COL]).strip()
category = row[CATEGORY_COL] if (CATEGORY_COL and CATEGORY_COL in sampled_meta.columns) else "unknown"
original = txt_map.get(vid).read_text(encoding="utf-8", errors="ignore").strip() if vid in txt_map else ""
rows.append({"youtube_id": vid, "text": original, "category": category})
return pd.DataFrame(rows)
def translate_and_write(df_rows: pd.DataFrame, target_lang: str, skip_translate=False, checkpoint_name="translations_accumulated.csv"):
translator = load_translator()
outs = []
for _, r in tqdm(df_rows.iterrows(), total=len(df_rows), desc="Translating"):
txt = r["text"] or ""
if skip_translate:
t = txt
else:
src = safe_detect(txt, default="unknown")
if src == target_lang or translator is None:
t = txt
else:
try:
t = translator.translate(txt, dest=target_lang).text
except Exception:
t = txt
outs.append({"youtube_id": r["youtube_id"], "text_translated": t, "text": txt, "category": r["category"]})
df_out = pd.DataFrame(outs)
main = DATA_DIR / checkpoint_name
if main.exists():
old = pd.read_csv(main)
combined = pd.concat([old, df_out], ignore_index=True).drop_duplicates(subset=["youtube_id"], keep="last")
combined.to_csv(main, index=False, encoding="utf-8")
else:
df_out.to_csv(main, index=False, encoding="utf-8")
def append_like(name, cols):
p = DATA_DIR / name
sub = df_out[cols]
if p.exists():
old = pd.read_csv(p)
pd.concat([old, sub], ignore_index=True).drop_duplicates(subset=["youtube_id"], keep="last").to_csv(p, index=False, encoding="utf-8")
else:
sub.to_csv(p, index=False, encoding="utf-8")
append_like("translations_accumulated_2col.csv", ["youtube_id","text_translated","category"])
append_like("translations_accumulated_transcripts.csv", ["youtube_id","text","category"])
    return df_out
Defines process_batches():
- Splits video IDs into batches
- Downloads audio, transcribes, and translates for each batch
- Saves intermediate results and logs failures
- Sleeps randomly between batches to prevent rate limiting
def chunked(lst, n):
for i in range(0, len(lst), n):
yield lst[i:i+n]
def process_batches(ids: List[str], sampled_meta: pd.DataFrame):
for i, batch_ids in enumerate(chunked(ids, BATCH_SIZE), start=1):
print(f"\n=== Batch {i} === ({len(batch_ids)} IDs)")
if not SKIP_DOWNLOAD:
d, f = download_audio_for_ids(batch_ids, AUDIO_DIR, AUDIO_EXT)
print(f"Downloaded: {len(d)} | Failed: {len(f)}")
else:
print("Skipping download.")
if not SKIP_TRANSCRIBE:
new_t = transcribe_all(TRANS_DIR, WHISPER_LANGUAGE_HINT)
print("New transcripts:", new_t)
else:
print("Skipping transcribe.")
meta_batch = sampled_meta[sampled_meta[YOUTUBE_ID_COL].astype(str).isin(batch_ids)]
df_rows = build_rows_from_transcripts(meta_batch)
_ = translate_and_write(df_rows, TARGET_LANG, skip_translate=SKIP_TRANSLATE, checkpoint_name="translations_accumulated.csv")
time.sleep(random.uniform(1,3))
process_batches(all_ids, sampled)
Imports and Setup
This cell imports all necessary Python libraries for data handling, visualization, and machine learning. It also sets a fixed random seed for reproducibility.
import os, re, sys, json, math, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils.multiclass import unique_labels
import joblib
# Reproducibility
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
This cell loads the translated dataset (generated by the YouTube translation pipeline above) into a pandas DataFrame.
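The CSV path below points into Google Drive, so Drive must be mounted first when running in Colab (skip this if the file was uploaded straight to /content):
# Mount Google Drive so /content/drive/MyDrive/... paths resolve (Colab only).
from google.colab import drive
drive.mount('/content/drive')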
drive_csv_path = "/content/drive/MyDrive/AJ.Mind project/translations_accumulated.csv" # 👈 EDIT if using Drive
# Choose the source:
csv_path = drive_csv_path # or a filename from 'uploaded' dict
# Read CSV
df = pd.read_csv(csv_path)
print("Loaded shape:", df.shape)
df.head(3)
This cell automatically detects which columns in the dataset represent text data and labels (categories). It helps standardize datasets that may use different column names.
# 🔎 Infer column names (text & label) if you haven't standardized them
possible_text_cols = ["text_translated", "translated_text", "transcript", "text", "content", "caption", "description"]
possible_label_cols = ["category", "label", "class", "topic"]
def find_col(candidates, columns):
cols_lower = {c.lower(): c for c in columns}
for cand in candidates:
if cand in cols_lower:
return cols_lower[cand]
return None
text_col = find_col(possible_text_cols, df.columns)
label_col = find_col(possible_label_cols, df.columns)
# Heuristics if not found
if text_col is None:
obj_cols = [c for c in df.columns if df[c].dtype == "object"]
if obj_cols:
lengths = {c: df[c].astype(str).str.len().mean() for c in obj_cols}
text_col = max(lengths, key=lengths.get)
else:
raise ValueError("❌ Could not infer a text column. Please add one (e.g., 'text_translated').")
if label_col is None:
candidates = []
for c in df.columns:
if c == text_col:
continue
uniq = df[c].nunique(dropna=True)
if df[c].dtype == "object" and 0 < uniq <= 50:
candidates.append((c, uniq))
if candidates:
label_col = sorted(candidates, key=lambda x: x[1])[0][0]
else:
raise ValueError("❌ Could not infer a label/category column. Please add one (e.g., 'category').")
print(f"Using TEXT column: {text_col}")
print(f"Using LABEL column: {label_col}")This cell loads your main dataset and shows the number of samples per category.
import pandas as pd
# Load the raw metadata into its own DataFrame so it does not overwrite the
# translated dataset already held in `df`, which the following cells clean and split.
df_raw = pd.read_csv("/content/youtube_data.csv")
print(df_raw["category"].value_counts())
This cell prepares the dataset for training by cleaning and filtering it.
# Clean data: strip whitespace, drop blanks/nulls/dupes
def is_blank(x):
if pd.isna(x): return True
if isinstance(x, str): return len(x.strip()) == 0
return False
before = len(df)
# Normalize whitespace
df[text_col] = df[text_col].astype(str).str.replace(r"\s+", " ", regex=True).str.strip()
df[label_col] = df[label_col].astype(str).str.strip()
# Drop blank/null and duplicates
df = df[~df[text_col].apply(is_blank) & ~df[label_col].apply(is_blank)]
df = df.dropna(subset=[text_col, label_col])
df = df.drop_duplicates(subset=[text_col, label_col])
after_clean = len(df)
print(f"Rows before: {before} -> after clean: {after_clean} (removed {before - after_clean})")
# ❗ Remove labels with < 2 samples (needed for valid stratified split + learnability)
counts = df[label_col].value_counts()
keep_labels = counts[counts >= 2].index
df = df[df[label_col].isin(keep_labels)].copy()
after_filter = len(df)
removed_rare = after_clean - after_filter
print(f"Removed rows due to rare labels (<2 per class): {removed_rare}")
print("Label distribution (post-filter):")
print(df[label_col].value_counts().sort_values(ascending=False).head(20))
This cell divides the cleaned dataset into training and testing sets.
# ✂️ Split 80:20 (stratified when possible)
X = df[text_col].astype(str).values
y = df[label_col].astype(str).values
can_stratify = len(np.unique(y)) > 1 and np.min(df[label_col].value_counts()) >= 2
stratify = y if can_stratify else None
if stratify is None:
print("⚠️ Not stratifying (some classes too small).")
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.20, random_state=SEED, stratify=stratify
)
print(f"Train size: {len(X_train)} | Test size: {len(X_test)}")This cell builds and trains a simple text classification model using Bag-of-Words features and a Linear Support Vector Classifier (SVC).
# 🧠 Build & train: Bag-of-Words → LinearSVC
# Notes:
# - CountVectorizer = classic Bag-of-Words (unigrams). You can try n-grams by setting ngram_range=(1,2).
# - Avoid language-specific stopwords because your data may be multilingual.
# - class_weight='balanced' helps with class imbalance.
pipeline = Pipeline([
("vect", CountVectorizer(lowercase=True, max_features=50000, ngram_range=(1,1))),
("clf", LinearSVC(class_weight='balanced', max_iter=10000, random_state=SEED))
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"✅ Accuracy (test): {acc:.4f}\n")
print("Classification report:")
print(classification_report(y_test, y_pred, zero_division=0))
This cell visualizes how well the model performs for each category.
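As an alternative to the manual matplotlib plot in the next cell, scikit-learn's ConfusionMatrixDisplay (already imported in the pipeline section above) draws the same matrix in a few lines; a sketch assuming a reasonably recent scikit-learn:
# Sketch: one-call confusion matrix using scikit-learn's built-in display.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, xticks_rotation=45)
plt.title("Confusion Matrix (LinearSVC)")
plt.tight_layout()
plt.show()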
labels = sorted(unique_labels(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred, labels=labels)
plt.figure(figsize=(8, 6))
plt.imshow(cm, interpolation='nearest')
plt.title('Confusion Matrix')
plt.xticks(ticks=np.arange(len(labels)), labels=labels, rotation=45, ha='right')
plt.yticks(ticks=np.arange(len(labels)), labels=labels)
plt.xlabel('Predicted')
plt.ylabel('True')
for i in range(cm.shape[0]):
for j in range(cm.shape[1]):
plt.text(j, i, cm[i, j], ha="center", va="center")
plt.tight_layout()
os.makedirs("/content/artifacts", exist_ok=True)
cm_path = "/content/artifacts/confusion_matrix.png"
plt.savefig(cm_path, bbox_inches="tight")
plt.show()
print("Saved:", cm_path)This cell saves the trained components and prediction samples for reuse.
# Save vectorizer & model, and sample predictions
vec = pipeline.named_steps["vect"]
clf = pipeline.named_steps["clf"]
vec_path = "/content/artifacts/bow_vectorizer.joblib"
clf_path = "/content/artifacts/bow_linearsvc_model.joblib"
joblib.dump(vec, vec_path)
joblib.dump(clf, clf_path)
# Sample predictions CSV
pred_df = pd.DataFrame({
"text": X_test,
"true_label": y_test,
"pred_label": y_pred
})
pred_csv = "/content/artifacts/sample_predictions.csv"
pred_df.to_csv(pred_csv, index=False)
# README
readme = f"""
Bag-of-Words Text Classifier (LinearSVC)
Columns used:
- TEXT: {text_col}
- LABEL: {label_col}
Rows:
- After cleaning & de-dup: {after_clean}
- Removed (labels with <2 samples): {removed_rare}
- Used for training: {after_filter}
Split: 80% train / 20% test
Accuracy: {acc:.4f}
Quick reuse:
import joblib
vec = joblib.load("{vec_path}")
clf = joblib.load("{clf_path}")
X = vec.transform(["example sentence here"])
print(clf.predict(X))
"""
with open("/content/artifacts/README.txt", "w", encoding="utf-8") as f:
f.write(readme)
print("Saved artifacts:")
print(vec_path)
print(clf_path)
print(cm_path)
print(pred_csv)
print("/content/artifacts/README.txt")This cell copies all trained models, vectorizers, and output files to Google Drive for safekeeping.
#Copy artifacts to Google Drive for safekeeping
# This will create MyDrive/AJ.Mind project/Colab_BOW_Artifacts and copy the files there.
target_dir = "/content/drive/MyDrive/AJ.Mind project/Colab_BOW_Artifacts"
os.makedirs(target_dir, exist_ok=True)
for fname in ["bow_vectorizer.joblib", "bow_linearsvc_model.joblib",
"confusion_matrix.png", "sample_predictions.csv", "README.txt"]:
src = f"/content/artifacts/{fname}"
dst = f"{target_dir}/{fname}"
if os.path.exists(src):
!cp "{src}" "{dst}"
print("Copied to Drive:", dst)
else:
print("Not found (skipped):", src)This cell trains a Multinomial Naive Bayes model using TF-IDF features.
# Train Naive Bayes
# You can switch between CountVectorizer (BoW) or TfidfVectorizer (weighted BoW)
vectorizer = TfidfVectorizer(lowercase=True, max_features=100000, ngram_range=(1,2), min_df=2)
pipeline = Pipeline([
("vect", vectorizer),
("clf", MultinomialNB())
])
pipeline.fit(X_train, y_train)
# Evaluate
y_pred = pipeline.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"✅ Accuracy: {acc:.4f}\n")
print(classification_report(y_test, y_pred, zero_division=0))
This cell plots the confusion matrix for the Naive Bayes model.
# Confusion matrix
labels = sorted(unique_labels(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred, labels=labels)
plt.figure(figsize=(8,6))
plt.imshow(cm, interpolation='nearest')
plt.title("Confusion Matrix - Naive Bayes")
plt.xticks(np.arange(len(labels)), labels, rotation=45, ha='right')
plt.yticks(np.arange(len(labels)), labels)
plt.xlabel("Predicted")
plt.ylabel("True")
for i in range(cm.shape[0]):
for j in range(cm.shape[1]):
plt.text(j, i, cm[i, j], ha='center', va='center')
plt.tight_layout()
os.makedirs("/content/artifacts", exist_ok=True)
cm_path = "/content/artifacts/confusion_matrix_nb.png"
plt.savefig(cm_path, bbox_inches='tight')
plt.show()
This cell saves the trained TF-IDF vectorizer and Naive Bayes model for later use.
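For later use, the saved artifacts can be reloaded without retraining; a short sketch mirroring the paths used in the save cell below:
# Sketch: reload the TF-IDF vectorizer and Naive Bayes model and classify new text.
import joblib
vec = joblib.load("/content/artifacts/nb_vectorizer.joblib")
clf = joblib.load("/content/artifacts/nb_model.joblib")
print(clf.predict(vec.transform(["example sentence here"])))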
# Save model & vectorizer
vec_path = "/content/artifacts/nb_vectorizer.joblib"
model_path = "/content/artifacts/nb_model.joblib"
joblib.dump(pipeline.named_steps["vect"], vec_path)
joblib.dump(pipeline.named_steps["clf"], model_path)
print("Artifacts saved to /content/artifacts/")This cell copies all trained Naive Bayes files and outputs to Google Drive for safekeeping
#📤 Copy artifacts to Google Drive for safekeeping
# This will create MyDrive/AJ.Mind project/Colab_BOW_Artifacts and copy the files there.
target_dir = "/content/drive/MyDrive/AJ.Mind project/Colab_BOW_Artifacts"
os.makedirs(target_dir, exist_ok=True)
for fname in ["nb_vectorizer.joblib", "nb_model.joblib",
"confusion_matrix_nb.png", "sample_predictions.csv", "README.txt"]:
src = f"/content/artifacts/{fname}"
dst = f"{target_dir}/{fname}"
if os.path.exists(src):
!cp "{src}" "{dst}"
print("Copied to Drive:", dst)
else:
print("Not found (skipped):", src)- Thanapat Iampramool 6713367
- Boonnada Onsamrith 67713372
- Paiya Boonyamin 6713389
- Pornlapas Kulthakerng 6713381
- Emtanan Malarat 6713397