PST to JSON + Vector DB on Windows and Ubuntu

This guide shows exactly how to:

1. Open and extract emails from a .pst file
2. Convert the emails into line-delimited JSON (.jsonl)
3. Optionally enrich each email with Qwen/Gemma
4. Create a local vector database (Chroma) from embeddings
5. Search your email archive semantically

Architecture (same for both OSes)

PST file
  -> readpst (extract .eml)
  -> Python parser (JSONL)
  -> optional Qwen/Gemma extraction
  -> embedding model (nomic-embed-text)
  -> Chroma vector DB
  -> semantic search / RAG

Prerequisites

- PST file available locally
- Python 3.10+ installed
- Disk space: at least 3x the PST size (extraction + JSON + vectors)
- Ollama installed for local models (recommended)

Privacy: PST can include sensitive student/HR/legal/medical info. Keep this workflow local unless you explicitly approve cloud processing.

Part A — Windows setup (recommended via WSL)

1) Install WSL + Ubuntu

PowerShell (Admin):
# Installs WSL and Ubuntu; run this from an elevated (Admin) PowerShell prompt.
wsl --install

Reboot if prompted, then open Ubuntu from Start menu.

2) Install extraction + Python dependencies in Ubuntu

# Refresh the package index, then install readpst (shipped in pst-utils) plus pip and venv support.
sudo apt update
sudo apt install -y pst-utils python3-pip python3-venv
# Create the project directory; every later step runs from here.
mkdir -p ~/pst_project
cd ~/pst_project
# Create and activate an isolated Python environment for the scripts in this guide.
python3 -m venv .venv
source .venv/bin/activate
pip install --upgrade pip
# Libraries imported by the scripts below: chromadb (vector DB), requests (Ollama HTTP API), beautifulsoup4 (HTML bodies).
pip install chromadb requests beautifulsoup4

3) Install Ollama on Windows

Download: https://ollama.com/download/windows

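Note: the Python scripts in Parts D and E run inside WSL and call Ollama at http://localhost:11434. With WSL2's default NAT networking, localhost inside WSL does not reach services running on the Windows host, so either enable WSL mirrored networking or install Ollama inside Ubuntu and pull the models there instead.

Then, in PowerShell, pull the models: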

# Chat models for the optional enrichment step (enrich_with_qwen.py uses qwen2.5:7b unless MODEL is changed to gemma2:9b).
ollama pull qwen2.5:7b
ollama pull gemma2:9b
# Embedding model used by build_vector_db.py.
ollama pull nomic-embed-text

4) Extract PST to EML files

Example, reading a PST stored on the Windows side (the Windows C: drive is mounted at /mnt/c inside WSL):

cd ~/pst_project
mkdir -p extracted_mail
# -e writes every message to its own .eml file inside a directory tree that mirrors the PST folders.
# (-r/-u select recursive/Thunderbird mbox output instead, which produces no .eml files for the parser.)
readpst -e -o extracted_mail /mnt/c/Users/YOUR_USERNAME/Documents/archive.pst

Success check: You should see nested folders and many .eml files under ~/pst_project/extracted_mail.

Part B — Ubuntu native setup

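If your machine already runs Ubuntu, use the commands below directly. Also install Ollama for Linux (https://ollama.com/download/linux) and pull the same three models listed in Part A, step 3: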

# Refresh the package index, then install readpst (shipped in pst-utils) plus pip and venv support.
sudo apt update
sudo apt install -y pst-utils python3-pip python3-venv
# Create the project directory and an isolated Python environment inside it.
mkdir -p ~/pst_project && cd ~/pst_project
python3 -m venv .venv
source .venv/bin/activate
pip install --upgrade pip
pip install chromadb requests beautifulsoup4
mkdir -p extracted_mail
# -e writes every message to its own .eml file inside a directory tree that mirrors the PST folders.
# (-r/-u select recursive/Thunderbird mbox output instead, which produces no .eml files for the parser.)
readpst -e -o extracted_mail /path/to/archive.pst

Part C — Convert EML to JSONL

Create parse_eml_to_jsonl.py in ~/pst_project:

from pathlib import Path
from email import policy
from email.parser import BytesParser
from bs4 import BeautifulSoup
import json

# Directory tree written by readpst; it is searched recursively for *.eml files.
SRC = Path("extracted_mail")
# Output file: one JSON object per line, one line per successfully parsed email.
OUT = Path("emails.jsonl")


def html_to_text(html: str) -> str:
    """Return the visible text of an HTML fragment.

    The markup is parsed with the built-in "html.parser" backend; the text
    pieces are stripped individually and joined with newlines.
    """
    soup = BeautifulSoup(html, "html.parser")
    return soup.get_text(separator="\n", strip=True)


def body_from_msg(msg):
    """Return the best available plain-text body of a parsed email.

    Plain-text parts are preferred. If there are none, or they contain only
    whitespace, the HTML parts are converted to text with ``html_to_text``
    and used instead.

    Args:
        msg: A message parsed with a modern email policy (for example
            ``policy.default``), so that ``walk()``, ``get_content()`` and
            ``get_content_disposition()`` are available.

    Returns:
        The stripped body text, or ``""`` when no readable text part exists.
    """
    text_parts, html_parts = [], []
    is_multipart = msg.is_multipart()

    # walk() yields the message itself when it is not multipart, so a single
    # loop handles both the multipart and the single-part case.
    for part in msg.walk():
        ctype = part.get_content_type()
        # Only text parts can contribute to the body. Checking the type first
        # avoids decoding images and other binary parts just to discard them,
        # and skips multipart containers, which have no content of their own.
        if ctype not in ("text/plain", "text/html"):
            continue
        # Text files attached to a multipart message are not part of the body.
        if is_multipart and part.get_content_disposition() == "attachment":
            continue
        try:
            content = part.get_content()
        except Exception:
            # Deliberate best effort: a part that cannot be decoded (unknown
            # charset, damaged encoding, ...) should not cost us the rest of
            # the message.
            continue
        if ctype == "text/plain":
            text_parts.append(content)
        else:
            html_parts.append(content)

    text = "\n".join(text_parts).strip()
    if text:
        return text
    # Fall back to HTML when the plain-text alternative is missing or blank;
    # many mail clients send an empty text/plain part next to the real HTML body.
    if html_parts:
        return "\n".join(html_to_text(h) for h in html_parts).strip()
    return ""


# Parse every .eml file under SRC and write one JSON record per message to OUT.
with OUT.open("w", encoding="utf-8") as out:
    i = 0  # Running count of written messages; also used as the record id.
    # Sort the paths: rglob() yields files in a filesystem-dependent order,
    # and the counter becomes each record's "id" (reused later as the vector
    # DB id), so a stable order keeps ids reproducible across runs.
    for p in sorted(SRC.rglob("*.eml")):
        try:
            with p.open("rb") as f:
                msg = BytesParser(policy=policy.default).parse(f)
            rec = {
                "id": i,
                "file": str(p),
                "subject": msg.get("subject", ""),
                "from": msg.get("from", ""),
                "to": msg.get("to", ""),
                "cc": msg.get("cc", ""),
                "date": msg.get("date", ""),
                "message_id": msg.get("message-id", ""),
                "body": body_from_msg(msg)
            }
            out.write(json.dumps(rec, ensure_ascii=False) + "\n")
            i += 1
        except Exception as e:
            # Best effort: report the unreadable file and keep going, so one
            # damaged message does not abort the whole export.
            print(f"skip {p}: {e}")

print(f"done: {i} messages -> {OUT}")

Run:

# Run from the project directory with the virtual environment active, so the
# relative paths (extracted_mail, emails.jsonl) and installed packages resolve.
cd ~/pst_project
source .venv/bin/activate
python parse_eml_to_jsonl.py

Part D — Optional Qwen/Gemma extraction

Use this only if you want enriched metadata (summary, category, action items, entities).

Create enrich_with_qwen.py:

import json, requests
from pathlib import Path

# Input produced by parse_eml_to_jsonl.py and the enriched copy written here.
INP = Path("emails.jsonl")
OUT = Path("emails.enriched.jsonl")
# Base URL of the local Ollama HTTP API and the chat model to query.
OLLAMA = "http://localhost:11434"
MODEL = "qwen2.5:7b"  # or gemma2:9b

# Prompt template; {email} is replaced with the rendered headers and body.
prompt_t = """Extract JSON with keys:
summary, category, action_items, people, organizations, dates, sensitive_info_present.
Email:\n{email}\nReturn JSON only."""

failed = 0
with INP.open("r", encoding="utf-8") as f, OUT.open("w", encoding="utf-8") as out:
    for line in f:
        # Tolerate blank lines (for example a trailing newline at end of file).
        if not line.strip():
            continue
        rec = json.loads(line)
        # Cap the body at 8000 characters to keep the prompt bounded;
        # "or ''" guards against a null body in the input.
        body = (rec.get("body") or "")[:8000]
        email_text = (
            f"Subject: {rec.get('subject','')}\n"
            f"From: {rec.get('from','')}\n"
            f"Date: {rec.get('date','')}\n\n"
            f"{body}"
        )
        try:
            r = requests.post(f"{OLLAMA}/api/generate", json={
                "model": MODEL,
                "prompt": prompt_t.format(email=email_text),
                "stream": False
            }, timeout=180)
            r.raise_for_status()
            rec["llm_extraction_raw"] = r.json().get("response", "")
        except (requests.RequestException, ValueError) as e:
            # A timeout or bad response for one email must not abort a run
            # that may cover thousands of messages: keep the record, note the
            # error, and continue so the output stays aligned with the input.
            failed += 1
            rec["llm_extraction_raw"] = ""
            rec["llm_extraction_error"] = str(e)
            print(f"enrichment failed for id {rec.get('id')}: {e}")
        out.write(json.dumps(rec, ensure_ascii=False) + "\n")

if failed:
    print(f"{failed} record(s) could not be enriched; see llm_extraction_error")
print("done")

Part E — Build vector DB (Chroma)

Create build_vector_db.py:

import json, requests, chromadb
from pathlib import Path

# JSONL file to index, one email record per line.
INP = Path("emails.jsonl")  # or emails.enriched.jsonl
# Directory in which Chroma persists the database.
DB_DIR = "./email_vector_db"
# Name of the Chroma collection that holds the email vectors.
COL = "pst_emails"
# Base URL of the Ollama HTTP API called by embed().
OLLAMA = "http://localhost:11434"
# Model name sent with every embeddings request.
EMBED_MODEL = "nomic-embed-text"

# Open the on-disk database and fetch the collection, creating it on first use.
client = chromadb.PersistentClient(path=DB_DIR)
collection = client.get_or_create_collection(COL)


def embed(text):
    """Return the embedding of ``text`` from Ollama's embeddings endpoint.

    Posts the text together with ``EMBED_MODEL`` to ``{OLLAMA}/api/embeddings``
    (120-second timeout) and returns the "embedding" field of the JSON reply.
    An HTTP error status raises via ``raise_for_status()``.
    """
    payload = {
        "model": EMBED_MODEL,
        "prompt": text
    }
    response = requests.post(f"{OLLAMA}/api/embeddings", json=payload, timeout=120)
    response.raise_for_status()
    return response.json()["embedding"]


def render_doc(rec):
    """Render an email record as the text document that gets embedded.

    The result is five labelled header lines (Subject, From, To, CC, Date),
    a blank line, then the body, with surrounding whitespace stripped.
    Missing keys render as empty strings.
    """
    header_fields = (
        ("Subject", "subject"),
        ("From", "from"),
        ("To", "to"),
        ("CC", "cc"),
        ("Date", "date"),
    )
    lines = [f"{label}: {rec.get(key, '')}" for label, key in header_fields]
    lines.append("")  # Blank separator line between headers and body.
    lines.append(f"{rec.get('body', '')}")
    return "\n".join(lines).strip()

# Embed every record in INP and store it in the Chroma collection in batches of 50.
batch_ids, batch_docs, batch_embs, batch_meta = [], [], [], []
with INP.open("r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        # Tolerate blank lines (for example a trailing newline at end of file).
        if not line.strip():
            continue
        rec = json.loads(line)
        # Skip emails with almost no content. The length is measured on
        # subject + body: the rendered document always contains the header
        # labels ("Subject:", "From:", ...), so its own length never falls
        # below the threshold.
        content = f"{rec.get('subject', '')} {rec.get('body', '')}".strip()
        if len(content) < 20:
            continue
        doc = render_doc(rec)
        emb = embed(doc)
        # Fall back to the line number when a record has no "id" field.
        batch_ids.append(str(rec.get("id", i)))
        batch_docs.append(doc)
        batch_embs.append(emb)
        batch_meta.append({
            "subject": rec.get("subject", ""),
            "from": rec.get("from", ""),
            "to": rec.get("to", ""),
            "date": rec.get("date", ""),
            "file": rec.get("file", "")
        })

        if len(batch_ids) >= 50:
            # upsert rather than add: re-running the script against an
            # existing database updates records in place instead of
            # colliding with ids that are already stored.
            collection.upsert(ids=batch_ids, documents=batch_docs, embeddings=batch_embs, metadatas=batch_meta)
            batch_ids, batch_docs, batch_embs, batch_meta = [], [], [], []

# Write whatever is left over from the final, partially filled batch.
if batch_ids:
    collection.upsert(ids=batch_ids, documents=batch_docs, embeddings=batch_embs, metadatas=batch_meta)

print("vector DB ready at", DB_DIR)