Spaces:

build-small-hackathon
/

scrubdata

Running

File size: 3,715 Bytes

16dc556

"""Generate a realistically *dirty* office-style leads export.

Deterministic (no RNG) so the demo and tests are reproducible. Every row is
hand-crafted to exhibit specific problems from PRODUCT.md section 2, so the
profiler/planner/executor each have something real to chew on.

Run:  uv run samples/make_sample.py
Out:  samples/dirty_contacts.csv
"""

from __future__ import annotations

import csv
from pathlib import Path

# Columns (same order as HEADER):
#   name, email, company, country, signup_date, amount, phone, status, is_active, age, notes2
# Problems baked in per column:
#   name      -> leading whitespace, doubled internal space
#   email     -> casing, leading/trailing whitespace, a typo domain (gmial.com)
#   company   -> casing, trailing whitespace, smart quotes, Inc/Corp suffix variants
#   country   -> USA/U.S.A./U.S.A/united states/US, UK/U.K., whitespace, casing
#   signup_date -> ISO, US slash, EU slash, "5 Jan 2023", Excel serial
#   amount    -> "$1,200.50", "1.200,50", "(500)", plain ints like "950", blank, "-"
#   phone     -> parentheses, dots, dashes, spaces, bare digits, +1 / +44 prefixes
#   status    -> Won/won/WON, Lost/lost, "In Progress"/"in-progress"/"in progress"
#   is_active -> Yes/yes/Y/TRUE/T/1 vs No/no/N/FALSE/F/0
#   age       -> ints, one 999 outlier, one blank, one "N/A"
#   notes2    -> entirely empty column (should be dropped)
# Whole-row problems: two exact duplicates of the first row, one fully empty row.
# Whitespace and punctuation inside the string values are deliberate; do not
# "tidy" them.
ROWS = [
    ["  Alice Johnson", "ALICE@EXAMPLE.COM ", "Acme Inc", "USA", "2023-01-05", "$1,200.50", "(555) 123-4567", "Won", "Yes", "34", ""],
    ["Bob Smith", "bob@example.com", "acme inc ", "U.S.A.", "01/06/2023", "950", "555.234.5678", "won", "Y", "41", ""],
    ["Carol  Diaz", "carol@gmial.com", "Globex", "united states", "07/01/2023", "1.200,50", "+1 555 345 6789", "WON", "TRUE", "29", ""],
    ["David Lee", "  david@example.com", "globex corp", "US", "5 Jan 2023", "(500)", "5553456789", "Lost", "1", "52", ""],
    ["Eve Adams", "eve@example.com", "Initech", "Canada", "44931", "2,300", "555-456-7890", "lost", "No", "999", ""],
    ["Frank Moore", "FRANK@example.com ", "initech", "canada", "2023-02-14", "", "(555)567-8901", "In Progress", "N", "", ""],
    ["Grace Park", "grace@example.com", "Umbrella", "UK", "14/02/2023", "$3,000", "+44 20 7946 0958", "in-progress", "FALSE", "38", ""],
    ["Heidi Cruz", "heidi@example.com", "umbrella corp ", "U.K.", "2023-03-01", "1,750.00", "020 7946 0958", "Won", "0", "N/A", ""],
    ["  Ivan Petrov", "IVAN@EXAMPLE.COM", "“Soylent”", "Germany", "03/02/2023", "4.500,00", "555 678 9012", "WON", "yes", "45", ""],
    ["Judy Wong", "judy@example.com ", "Soylent Corp", "germany", "2 Mar 2023", "2200", "(555) 789-0123", "Lost", "no", "33", ""],
    # two exact duplicates of the first row (Alice), so that record appears 3x in total
    ["  Alice Johnson", "ALICE@EXAMPLE.COM ", "Acme Inc", "USA", "2023-01-05", "$1,200.50", "(555) 123-4567", "Won", "Yes", "34", ""],
    ["  Alice Johnson", "ALICE@EXAMPLE.COM ", "Acme Inc", "USA", "2023-01-05", "$1,200.50", "(555) 123-4567", "Won", "Yes", "34", ""],
    # an entirely empty row
    ["", "", "", "", "", "", "", "", "", "", ""],
    ["Karl Brandt", "karl@example.com", "Hooli", "United States", "2023-04-10", "$5,400.00", "+1 (555) 890-1234", "Won", "TRUE", "47", ""],
    ["Lena Fischer", "lena@example.com", "hooli inc", "USA ", "10/04/2023", "-", "555.901.2345", "in progress", "T", "31", ""],
    ["Mona Ali", "mona@example.com", "Vehement", "U.S.A", "2023-05-22", "6,000", "5559012345", "Won", "F", "39", ""],
]

# Column names emitted as the first line of the CSV, in output order.
HEADER = [
    "name",
    "email",
    "company",
    "country",
    "signup_date",
    "amount",
    "phone",
    "status",
    "is_active",
    "age",
    "notes2",
]


def main() -> None:
    """Write the sample data to ``dirty_contacts.csv`` next to this script.

    The file is opened in write mode (any existing file is overwritten) as
    UTF-8 and receives HEADER as its first line followed by every entry of
    ROWS. A one-line summary with the output path and the row/column counts
    is printed when the file has been written.
    """
    out = Path(__file__).parent.joinpath("dirty_contacts.csv")
    # newline="" hands line-ending control to the csv module.
    with open(out, "w", newline="", encoding="utf-8") as handle:
        csv.writer(handle).writerows([HEADER, *ROWS])
    print(f"Wrote {out} ({len(ROWS)} rows x {len(HEADER)} cols)")


# Generate the CSV only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()