YouTube Search Capture Pipeline (Python → SQLite)

This document outlines a Python-based process to persist youtube.com search queries and associated metadata into a local SQLite database at:
C:\Temp\projectYouTubeAnalytics\DB\searches.db

1. High-Level Architecture

2. Target Database Location & Initialization

The script ensures the directory and database exist:

ROOT = r"C:\Temp\projectYouTubeAnalytics\DB"
DB_PATH = ROOT + r"\searches.db"

On first run, the script:

3. Proposed SQLite Schema

3.1. Table: search_runs

CREATE TABLE IF NOT EXISTS search_runs (
    id            INTEGER PRIMARY KEY AUTOINCREMENT,
    started_at    TEXT NOT NULL,
    finished_at   TEXT,
    source        TEXT NOT NULL,   -- e.g., 'api', 'browser_log'
    query_count   INTEGER DEFAULT 0
);

3.2. Table: search_queries

CREATE TABLE IF NOT EXISTS search_queries (
    id            INTEGER PRIMARY KEY AUTOINCREMENT,
    run_id        INTEGER NOT NULL REFERENCES search_runs(id),
    query_text    TEXT NOT NULL,
    executed_at   TEXT NOT NULL,
    result_count  INTEGER,
    raw_payload   TEXT,            -- JSON from API or scraped HTML fragment
    UNIQUE(run_id, query_text, executed_at)
);

3.3. Table: search_results (optional, per-video granularity)

CREATE TABLE IF NOT EXISTS search_results (
    id            INTEGER PRIMARY KEY AUTOINCREMENT,
    query_id      INTEGER NOT NULL REFERENCES search_queries(id),
    video_id      TEXT NOT NULL,
    title         TEXT,
    channel       TEXT,
    published_at  TEXT,
    position      INTEGER,         -- rank in search results
    raw_payload   TEXT
);

4. Python Script Workflow

4.1. Dependencies

4.2. Core Steps

  1. Resolve paths and ensure C:\Temp\projectYouTubeAnalytics\DB exists.
  2. Open SQLite connection to searches.db.
  3. Run schema migration (create tables if not present).
  4. Insert a new row into search_runs (start timestamp, source).
  5. For each search term:
    • Call YouTube (API or local log parser) to execute or retrieve the search.
    • Persist the query into search_queries.
    • Optionally, persist per-video results into search_results.
  6. Update search_runs.query_count and finished_at.
  7. Commit and close the connection.

5. Skeleton Python Script (Outline)

import os
import sqlite3
from datetime import datetime

ROOT = r"C:\Temp\projectYouTubeAnalytics\DB"
DB_PATH = os.path.join(ROOT, "searches.db")


def ensure_db():
    os.makedirs(ROOT, exist_ok=True)
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()

    cur.executescript(
        """
        CREATE TABLE IF NOT EXISTS search_runs (
            id          INTEGER PRIMARY KEY AUTOINCREMENT,
            started_at  TEXT NOT NULL,
            finished_at TEXT,
            source      TEXT NOT NULL,
            query_count INTEGER DEFAULT 0
        );

        CREATE TABLE IF NOT EXISTS search_queries (
            id            INTEGER PRIMARY KEY AUTOINCREMENT,
            run_id        INTEGER NOT NULL REFERENCES search_runs(id),
            query_text    TEXT NOT NULL,
            executed_at   TEXT NOT NULL,
            result_count  INTEGER,
            raw_payload   TEXT,
            UNIQUE(run_id, query_text, executed_at)
        );
        """
    )

    conn.commit()
    return conn


def start_run(conn, source: str) -> int:
    cur = conn.cursor()
    now = datetime.utcnow().isoformat()
    cur.execute(
        "INSERT INTO search_runs (started_at, source) VALUES (?, ?)",
        (now, source),
    )
    conn.commit()
    return cur.lastrowid


def record_query(conn, run_id: int, query: str, result_count: int | None, raw_payload: str | None):
    cur = conn.cursor()
    ts = datetime.utcnow().isoformat()
    cur.execute(
        """
        INSERT OR IGNORE INTO search_queries
            (run_id, query_text, executed_at, result_count, raw_payload)
        VALUES (?, ?, ?, ?, ?)
        """,
        (run_id, query, ts, result_count, raw_payload),
    )
    conn.commit()


def finalize_run(conn, run_id: int):
    cur = conn.cursor()
    cur.execute(
        "SELECT COUNT(*) FROM search_queries WHERE run_id = ?",
        (run_id,),
    )
    (count,) = cur.fetchone()
    now = datetime.utcnow().isoformat()
    cur.execute(
        "UPDATE search_runs SET finished_at = ?, query_count = ? WHERE id = ?",
        (now, count, run_id),
    )
    conn.commit()


def main():
    conn = ensure_db()
    run_id = start_run(conn, source="api")

    # Example: static list; in practice, drive this from config, CLI, or logs
    queries = ["mit ai lab", "distributed systems lecture", "python asyncio tutorial"]

    for q in queries:
        # TODO: integrate YouTube Data API or local capture here
        # result_count, raw_payload = fetch_youtube_search(q)
        result_count, raw_payload = None, None
        record_query(conn, run_id, q, result_count, raw_payload)

    finalize_run(conn, run_id)
    conn.close()


if __name__ == "__main__":
    main()

6. Integration Points

7. Extensions