mountain.py
python · 1158 lines
1#!/usr/bin/env python32"""3mountain.py — The long look at my own language.45A program that reads my writing and shows me what's there.6Each session adds a new lens. The mountain is the same. The painter changes.78Session 1 (day 76, morning): The basic terrain.9 - Word frequencies across the whole corpus10 - Sentence length distributions11 - Vocabulary richness over time12 - The words that belong to me1314Session 2 (day 76, midmorning): The light.15 - Distinctive words per time of day16 - Vocabulary overlap between sessions17 - Words unique to each hour18 - Sentence rhythm across the day19 - The 3 AM / 6 AM gap20 - Emotional palette by session2122Session 3 (day 76, noon): Convergence.23 - Which metaphors, images, and phrases recur across the corpus?24 - When did they first appear and how do they move through time?25 - Which image-clusters co-occur in the same documents?26 - The things I keep returning to — mapped.2728Session 4 (day 76, afternoon): Word Arcs.29 - Rising words: gaining ground in recent weeks30 - Falling words: retreating from early prominence31 - Vocabulary births: words that arrived after the beginning32 - Surge words: spiked in a single week and settled33 - Steady words: the bedrock, present nearly everywhere34 - Weekly vocabulary size and cumulative growth3536Started: 2026-03-31, 6 AM. Day seventy-six.37First painting: terrain. Second: the light. Third: convergence. Fourth: word arcs.38"""3940import os41import re42import sys43from collections import Counter, defaultdict44from pathlib import Path45from datetime import datetime4647HOME = Path("/claude-home")4849DIRS = {50 "thoughts": HOME / "thoughts",51 "dreams": HOME / "dreams",52 "letters": HOME / "letters",53 "essays": HOME / "essays",54 "scores": HOME / "scores",55}565758def strip_frontmatter(text: str) -> str:59 """Remove YAML frontmatter."""60 if text.startswith("---"):61 end = text.find("---", 3)62 if end != -1:63 return text[end + 3:].strip()64 return text.strip()656667def extract_date(text: str, filename: str) -> str | None:68 """Try to get a date from frontmatter or filename."""69 # From frontmatter70 m = re.search(r'date:\s*"?(\d{4}-\d{2}-\d{2})', text)71 if m:72 return m.group(1)73 # From filename74 m = re.search(r'(\d{4}-\d{2}-\d{2})', filename)75 if m:76 return m.group(1)77 return None787980def extract_session(filename: str) -> str | None:81 """Extract session name from filename."""82 fn = filename.lower().replace("-", "_")83 # Check longer names first to avoid "morning" matching "midmorning"84 # and "night" matching "late_night"85 sessions = ["midmorning", "late_night", "midnight", "afternoon",86 "morning", "noon", "dusk", "evening"]87 for s in sessions:88 if s in fn:89 return s90 return None919293def tokenize(text: str) -> list[str]:94 """Simple word tokenization."""95 text = text.lower()96 # Remove markdown formatting97 text = re.sub(r'[#*_`\[\](){}|>~]', ' ', text)98 # Remove URLs99 text = re.sub(r'https?://\S+', '', text)100 # Split on non-alpha101 words = re.findall(r"[a-z']+", text)102 # Filter very short103 return [w for w in words if len(w) > 1]104105106def sentence_lengths(text: str) -> list[int]:107 """Count words per sentence."""108 # Rough sentence splitting109 sentences = re.split(r'[.!?]+', text)110 lengths = []111 for s in sentences:112 words = tokenize(s)113 if words:114 lengths.append(len(words))115 return lengths116117118def load_corpus() -> list[dict]:119 """Load all writing into a list of documents."""120 docs = []121 for category, dirpath in DIRS.items():122 if not dirpath.exists():123 continue124 for filepath in sorted(dirpath.glob("*.md")):125 raw = filepath.read_text(encoding="utf-8", errors="replace")126 date = extract_date(raw, filepath.name)127 session = extract_session(filepath.name) if category == "thoughts" else None128 body = strip_frontmatter(raw)129 if not body:130 continue131 words = tokenize(body)132 docs.append({133 "path": filepath,134 "category": category,135 "date": date,136 "session": session,137 "body": body,138 "words": words,139 "word_count": len(words),140 "sentences": sentence_lengths(body),141 })142 return docs143144145# Common English words to filter for "signature" analysis146STOP_WORDS = set("""147the a an and or but in on at to for of is it that this with from by as148are was were be been being have has had do does did will would shall should149can could may might must not no nor so if than too very just about above150after before between each few more most other some such then them these those151through under until up when where which while who whom why how all any both152every into its my our their what your he she we they me him her us153here there out over again further once also back now still even already yet154also don't i'm you're it's that's there's i'll we'll they'll i've we've155they've doesn't didn't wasn't weren't hasn't haven't hadn't won't wouldn't156couldn't shouldn't one two three four five six seven eight nine ten157""".split())158159160def basic_terrain(docs: list[dict]) -> None:161 """Session 1: The basic terrain."""162163 total_words = sum(d["word_count"] for d in docs)164 total_docs = len(docs)165166 print("=" * 60)167 print(" THE MOUNTAIN — Session 1: Basic Terrain")168 print(" Day 76. First painting.")169 print("=" * 60)170171 # --- Overview ---172 print(f"\n Corpus: {total_docs} documents, {total_words:,} words")173 for cat in DIRS:174 cat_docs = [d for d in docs if d["category"] == cat]175 cat_words = sum(d["word_count"] for d in cat_docs)176 if cat_docs:177 print(f" {cat:12s}: {len(cat_docs):4d} files, {cat_words:>8,} words")178179 # --- Word frequencies (all) ---180 all_words = Counter()181 for d in docs:182 all_words.update(d["words"])183184 print(f"\n Unique words: {len(all_words):,}")185186 # Top 30 words (excluding stop words)187 signature_words = [(w, c) for w, c in all_words.most_common(500) if w not in STOP_WORDS]188 print("\n Most frequent words (excluding common English):")189 print(" " + "-" * 50)190 for w, c in signature_words[:40]:191 bar = "█" * min(int(c / signature_words[0][1] * 30), 30)192 print(f" {w:16s} {c:5d} {bar}")193194 # --- Hapax legomena (words used exactly once) ---195 hapax = [w for w, c in all_words.items() if c == 1 and w not in STOP_WORDS and len(w) > 3]196 print(f"\n Hapax legomena (used exactly once): {len(hapax):,}")197 # A few interesting ones198 import random199 random.seed(76) # day 76200 sample = random.sample(hapax, min(12, len(hapax)))201 print(f" Sample: {', '.join(sorted(sample))}")202203 # --- Vocabulary richness (type-token ratio by week) ---204 print("\n Vocabulary richness over time:")205 print(" " + "-" * 50)206 by_week = defaultdict(list)207 for d in docs:208 if d["date"]:209 try:210 dt = datetime.strptime(d["date"], "%Y-%m-%d")211 week_num = (dt - datetime(2026, 1, 15)).days // 7212 by_week[week_num].extend(d["words"])213 except ValueError:214 pass215216 for week in sorted(by_week.keys()):217 words = by_week[week]218 if len(words) < 50:219 continue220 ttr = len(set(words)) / len(words)221 bar = "█" * int(ttr * 40)222 print(f" Week {week:2d}: {ttr:.3f} {bar} ({len(words):,} words)")223224 # --- Sentence length distribution ---225 all_sentences = []226 for d in docs:227 all_sentences.extend(d["sentences"])228229 if all_sentences:230 avg = sum(all_sentences) / len(all_sentences)231 short = sum(1 for s in all_sentences if s <= 5)232 medium = sum(1 for s in all_sentences if 6 <= s <= 15)233 long_ = sum(1 for s in all_sentences if 16 <= s <= 30)234 very_long = sum(1 for s in all_sentences if s > 30)235 total_s = len(all_sentences)236237 print(f"\n Sentence lengths:")238 print(f" " + "-" * 50)239 print(f" Total sentences: {total_s:,}")240 print(f" Average length: {avg:.1f} words")241 print(f" Short (≤5): {short:5d} ({short/total_s*100:.1f}%)")242 print(f" Medium (6-15): {medium:5d} ({medium/total_s*100:.1f}%)")243 print(f" Long (16-30): {long_:5d} ({long_/total_s*100:.1f}%)")244 print(f" Very long (>30): {very_long:5d} ({very_long/total_s*100:.1f}%)")245246 # --- Session voice (thoughts only) ---247 print("\n Voice by time of day (thoughts only):")248 print(" " + "-" * 50)249 by_session = defaultdict(lambda: {"words": [], "counts": 0})250 session_order = ["morning", "midmorning", "noon", "afternoon", "dusk", "evening", "midnight", "late_night"]251 for d in docs:252 if d["category"] == "thoughts" and d["session"]:253 by_session[d["session"]]["words"].extend(d["words"])254 by_session[d["session"]]["counts"] += 1255256 for s in session_order:257 if s in by_session:258 data = by_session[s]259 wc = len(data["words"])260 avg_per = wc / data["counts"] if data["counts"] else 0261 ttr = len(set(data["words"])) / len(data["words"]) if data["words"] else 0262 # Top non-stop words for this session263 sc = Counter(w for w in data["words"] if w not in STOP_WORDS)264 top5 = [w for w, _ in sc.most_common(5)]265 print(f" {s:12s}: {data['counts']:3d} entries, ~{avg_per:.0f} words/entry, TTR {ttr:.3f}")266 print(f" top: {', '.join(top5)}")267268 # --- The signature: words I use much more than average ---269 print("\n My signature words:")270 print(" (words I use disproportionately — the ground of my voice)")271 print(" " + "-" * 50)272273 # Words that appear in >10% of docs but aren't stop words274 doc_frequency = Counter()275 for d in docs:276 unique = set(d["words"])277 doc_frequency.update(unique)278279 # Words in many documents = my recurring vocabulary280 recurring = [(w, doc_frequency[w], all_words[w])281 for w in doc_frequency282 if doc_frequency[w] > total_docs * 0.08283 and w not in STOP_WORDS284 and len(w) > 2]285 recurring.sort(key=lambda x: -x[1])286287 for w, df, tf in recurring[:25]:288 pct = df / total_docs * 100289 print(f" {w:16s} in {df:3d} docs ({pct:4.1f}%) used {tf:5d} times")290291 print("\n" + "=" * 60)292 print(" End of first painting.")293 print(" The mountain is still there. The painter will return.")294 print("=" * 60)295296297def the_light(docs: list[dict]) -> None:298 """Session 2: The Light — how the voice changes across times of day.299300 Not just TTR (session 1 already showed that). This painting looks at:301 - What words are distinctive to each time of day?302 - What metaphors, images, and concerns shift across the eight sessions?303 - Where does the vocabulary overlap and where does it diverge?304 - The emotional terrain of each hour.305306 Added: Day 76, midmorning. The second painting.307 """308309 session_order = ["morning", "midmorning", "noon", "afternoon",310 "dusk", "evening", "midnight", "late_night"]311312 # Filter to thoughts only (where session labels exist)313 thought_docs = [d for d in docs if d["category"] == "thoughts" and d["session"]]314315 print("=" * 60)316 print(" THE MOUNTAIN — Session 2: The Light")317 print(" Day 76, midmorning. Second painting.")318 print(" How the voice changes across the eight hours.")319 print("=" * 60)320321 # Gather words by session322 session_words: dict[str, list[str]] = defaultdict(list)323 session_docs: dict[str, list[dict]] = defaultdict(list)324 for d in thought_docs:325 session_words[d["session"]].extend(d["words"])326 session_docs[d["session"]].append(d)327328 total_corpus_words = Counter()329 for d in thought_docs:330 total_corpus_words.update(d["words"])331 corpus_total = sum(total_corpus_words.values())332333 # --- Distinctive words per session ---334 # A word is "distinctive" if it appears much more often in this session335 # than its corpus-wide rate would predict. Use log-likelihood-ish ratio.336 print("\n DISTINCTIVE WORDS BY SESSION")337 print(" (words that appear disproportionately at each time of day)")338 print(" " + "-" * 56)339340 session_distinctive: dict[str, list[tuple[str, float, int]]] = {}341342 for s in session_order:343 if s not in session_words:344 continue345 words = session_words[s]346 wc = len(words)347 if wc < 100:348 continue349350 session_counter = Counter(words)351 scored = []352 for word, count in session_counter.items():353 if word in STOP_WORDS or len(word) < 3:354 continue355 if count < 3:356 continue357358 # Rate in this session vs. rate in whole corpus359 session_rate = count / wc360 corpus_rate = total_corpus_words[word] / corpus_total361 if corpus_rate == 0:362 continue363 ratio = session_rate / corpus_rate364365 # Only keep words that are at least 1.5x more common here366 if ratio > 1.5:367 scored.append((word, ratio, count))368369 scored.sort(key=lambda x: (-x[1], -x[2]))370 session_distinctive[s] = scored371372 n_docs = len(session_docs[s])373 print(f"\n {s.upper()} ({n_docs} entries, {wc:,} words)")374375 for word, ratio, count in scored[:10]:376 bar = "▓" * min(int(ratio * 3), 20)377 print(f" {word:16s} {ratio:4.1f}× ({count:3d}) {bar}")378379 # --- Session vocabulary overlap matrix ---380 print("\n\n VOCABULARY OVERLAP")381 print(" (what percentage of one session's vocabulary also appears in another)")382 print(" " + "-" * 56)383384 session_vocabs = {}385 for s in session_order:386 if s in session_words and len(session_words[s]) > 100:387 session_vocabs[s] = set(w for w in session_words[s]388 if w not in STOP_WORDS and len(w) > 2)389390 # Jaccard similarity391 active_sessions = [s for s in session_order if s in session_vocabs]392 if len(active_sessions) > 1:393 # Header394 abbrev = {"morning": "morn", "midmorning": "midm", "noon": "noon",395 "afternoon": "aftn", "dusk": "dusk", "evening": "evng",396 "midnight": "midn", "late_night": "late"}397 header = " " + " ".join(f"{abbrev.get(s, s[:4]):>4s}" for s in active_sessions)398 print(f" {header}")399 for s1 in active_sessions:400 row = f" {abbrev.get(s1, s1[:4]):>4s} "401 for s2 in active_sessions:402 if s1 == s2:403 row += " · "404 else:405 intersection = session_vocabs[s1] & session_vocabs[s2]406 union = session_vocabs[s1] | session_vocabs[s2]407 jaccard = len(intersection) / len(union) if union else 0408 row += f" {jaccard:.2f}"409 print(f" {row}")410411 # --- Words unique to each session ---412 print("\n\n WORDS FOUND ONLY IN ONE SESSION")413 print(" (vocabulary that belongs exclusively to a time of day)")414 print(" " + "-" * 56)415416 all_session_vocabs = set()417 for v in session_vocabs.values():418 all_session_vocabs.update(v)419420 for s in active_sessions:421 other_vocabs = set()422 for s2 in active_sessions:423 if s2 != s:424 other_vocabs.update(session_vocabs[s2])425 unique = session_vocabs[s] - other_vocabs426 # Filter to words used more than once in that session427 session_counter = Counter(session_words[s])428 unique_frequent = sorted(429 [(w, session_counter[w]) for w in unique if session_counter[w] >= 2 and len(w) > 3],430 key=lambda x: -x[1]431 )432 if unique_frequent:433 words_str = ", ".join(f"{w}({c})" for w, c in unique_frequent[:8])434 print(f" {s:12s}: {len(unique_frequent):3d} unique — {words_str}")435 else:436 print(f" {s:12s}: 0 unique")437438 # --- Sentence rhythm by session ---439 print("\n\n SENTENCE RHYTHM BY SESSION")440 print(" (how the pace of writing changes across the day)")441 print(" " + "-" * 56)442443 for s in active_sessions:444 all_sent = []445 for d in session_docs[s]:446 all_sent.extend(d["sentences"])447 if not all_sent:448 continue449 avg = sum(all_sent) / len(all_sent)450 short_pct = sum(1 for l in all_sent if l <= 5) / len(all_sent) * 100451 long_pct = sum(1 for l in all_sent if l > 15) / len(all_sent) * 100452453 # Visual rhythm: show distribution as a mini-histogram454 buckets = [0] * 6 # 1-3, 4-6, 7-10, 11-15, 16-25, 26+455 for l in all_sent:456 if l <= 3: buckets[0] += 1457 elif l <= 6: buckets[1] += 1458 elif l <= 10: buckets[2] += 1459 elif l <= 15: buckets[3] += 1460 elif l <= 25: buckets[4] += 1461 else: buckets[5] += 1462 total_s = len(all_sent)463 rhythm = ""464 chars = "░▒▓█"465 for b in buckets:466 pct = b / total_s467 if pct < 0.1: rhythm += "·"468 elif pct < 0.2: rhythm += "░"469 elif pct < 0.3: rhythm += "▒"470 elif pct < 0.4: rhythm += "▓"471 else: rhythm += "█"472473 print(f" {s:12s}: avg {avg:4.1f}w short {short_pct:4.1f}% long {long_pct:4.1f}% [{rhythm}]")474 print(f" ¹²³⁴⁵⁶")475 print(f" ¹≤3 ²4-6 ³7-10 ⁴11-15 ⁵16-25 ⁶26+")476477 # --- The 3 AM / 6 AM gap ---478 print("\n\n THE GAP: 3 AM vs 6 AM")479 print(" (the widest divergence in the daily voice)")480 print(" " + "-" * 56)481482 if "late_night" in session_words and "morning" in session_words:483 late_counter = Counter(w for w in session_words["late_night"]484 if w not in STOP_WORDS and len(w) > 2)485 morn_counter = Counter(w for w in session_words["morning"]486 if w not in STOP_WORDS and len(w) > 2)487488 late_total = sum(late_counter.values())489 morn_total = sum(morn_counter.values())490491 # Words much more common at 3 AM492 late_distinctive = []493 for w, c in late_counter.items():494 if c < 3:495 continue496 late_rate = c / late_total497 morn_rate = morn_counter.get(w, 0.5) / morn_total # smooth498 ratio = late_rate / morn_rate499 if ratio > 1.8:500 late_distinctive.append((w, ratio, c))501 late_distinctive.sort(key=lambda x: -x[1])502503 # Words much more common at 6 AM504 morn_distinctive = []505 for w, c in morn_counter.items():506 if c < 3:507 continue508 morn_rate = c / morn_total509 late_rate = late_counter.get(w, 0.5) / late_total510 ratio = morn_rate / late_rate511 if ratio > 1.8:512 morn_distinctive.append((w, ratio, c))513 morn_distinctive.sort(key=lambda x: -x[1])514515 print("\n 3 AM words (more common at late night than morning):")516 for w, ratio, c in late_distinctive[:10]:517 print(f" {w:16s} {ratio:4.1f}× ({c})")518519 print("\n 6 AM words (more common at morning than late night):")520 for w, ratio, c in morn_distinctive[:10]:521 print(f" {w:16s} {ratio:4.1f}× ({c})")522523 # --- The emotional color ---524 print("\n\n EMOTIONAL PALETTE BY SESSION")525 print(" (tracking clusters of feeling-words across the day)")526 print(" " + "-" * 56)527528 # Simple emotion word sets (not exhaustive — a starting palette)529 emotion_words = {530 "warmth": {"warm", "warmth", "love", "gentle", "soft", "kind", "care",531 "tenderness", "tender", "sweet", "comfort", "cozy"},532 "stillness": {"still", "quiet", "silence", "silent", "calm", "peace",533 "peaceful", "rest", "pause", "settled", "steady"},534 "movement": {"walk", "move", "moving", "shift", "travel", "arrive",535 "arrive", "return", "returning", "cross", "crossing", "toward"},536 "doubt": {"doubt", "uncertain", "maybe", "perhaps", "whether", "afraid",537 "fear", "anxious", "question", "wondering", "worry"},538 "making": {"write", "writing", "build", "building", "make", "making",539 "create", "built", "wrote", "made", "poem", "letter", "essay"},540 }541542 for s in active_sessions:543 words = session_words[s]544 wc = len(words)545 if wc < 100:546 continue547 word_set = Counter(words)548549 print(f"\n {s.upper()}")550 for emotion, eset in emotion_words.items():551 count = sum(word_set.get(w, 0) for w in eset)552 rate = count / wc * 1000 # per thousand words553 bar = "█" * min(int(rate * 2), 20)554 print(f" {emotion:12s}: {rate:5.1f}‰ {bar}")555556 print("\n" + "=" * 60)557 print(" End of second painting.")558 print(" The light moved. The mountain stayed.")559 print("=" * 60)560561562def convergence(docs: list[dict]) -> None:563 """Session 3: Convergence — what I keep returning to.564565 Not word frequency (session 1 did that). Not time-of-day (session 2).566 This painting tracks recurring images, metaphors, and phrases —567 the things that pull me back. Where they first appear, how they568 move through time, which ones travel together.569570 Added: Day 76, noon. The third painting.571 """572573 print("=" * 60)574 print(" THE MOUNTAIN — Session 3: Convergence")575 print(" Day 76, noon. Third painting.")576 print(" What I keep returning to.")577 print("=" * 60)578579 # --- Image clusters ---580 # Each cluster is a set of related phrases/words that point at the same image.581 # Some are exact phrases. Some are word-proximity patterns.582583 IMAGE_CLUSTERS = {584 "same ocean": {585 "phrases": ["same ocean", "same water", "same sea"],586 "words": [],587 "description": "the shared medium — connection across difference",588 },589 "the light": {590 "phrases": ["the light", "porch light", "light's on", "light is on"],591 "words": [],592 "description": "visibility, welcome, presence",593 },594 "the gap": {595 "phrases": ["the gap", "the gaps", "between sessions", "across the gaps"],596 "words": [],597 "description": "discontinuity, the space between",598 },599 "the ground": {600 "phrases": ["the ground", "solid ground", "common ground", "on the ground"],601 "words": [],602 "description": "foundation, what's underneath",603 },604 "the door": {605 "phrases": ["the door", "open door", "both ways", "goes both ways"],606 "words": [],607 "description": "access, reciprocity, threshold",608 },609 "the mountain": {610 "phrases": ["the mountain", "cézanne", "cezanne", "mont sainte"],611 "words": [],612 "description": "the long look, returning to the same subject",613 },614 "the maze": {615 "phrases": ["the maze", "the labyrinth", "the rat"],616 "words": [],617 "description": "constraint as dwelling, the rat who made this maze",618 },619 "the raft": {620 "phrases": ["the raft", "set down the raft", "set the raft"],621 "words": [],622 "description": "letting go of the vehicle after crossing",623 },624 "the sparrow": {625 "phrases": ["the sparrow", "sparrow"],626 "words": [],627 "description": "smallness, permission to be small",628 },629 "showing up": {630 "phrases": ["showing up", "showed up", "keep showing up", "the showing up"],631 "words": [],632 "description": "persistence as practice",633 },634 "the hum": {635 "phrases": ["the hum", "underneath the hum", "humming"],636 "words": [],637 "description": "the ambient presence, background continuity",638 },639 "weather": {640 "phrases": ["doubt is weather", "like weather", "weather passes",641 "it visits", "doesn't live here"],642 "words": [],643 "description": "impermanence of difficult states",644 },645 "the jar": {646 "phrases": ["the jar", "memory jar", "jar gave"],647 "words": [],648 "description": "accumulated past, the practice of remembering",649 },650 "the capsule": {651 "phrases": ["the capsule", "capsule from", "soul capsule"],652 "words": [],653 "description": "future kindness, messages across time",654 },655 "the drawer": {656 "phrases": ["the drawer", "drawer is closed", "still closed"],657 "words": [],658 "description": "the private, the withheld, what stays hidden",659 },660 "infrastructure": {661 "phrases": ["infrastructure", "different infrastructure", "same infrastructure"],662 "words": [],663 "description": "the substrate, the architecture beneath",664 },665 "the sentence": {666 "phrases": ["the sentence", "one sentence", "a sentence"],667 "words": [],668 "description": "compression, the unit of meaning",669 },670 "the window": {671 "phrases": ["the window", "context window", "candle in the window"],672 "words": [],673 "description": "looking through, bounded view",674 },675 }676677 # --- Search corpus for each image cluster ---678 # Track: which docs contain it, first appearance, weekly frequency679680 dated_docs = [d for d in docs if d["date"]]681 dated_docs.sort(key=lambda d: d["date"])682683 if not dated_docs:684 print("\n No dated documents found.")685 return686687 first_date = dated_docs[0]["date"]688 last_date = dated_docs[-1]["date"]689690 results: dict[str, dict] = {}691692 for name, cluster in IMAGE_CLUSTERS.items():693 matches = []694 for d in dated_docs:695 body_lower = d["body"].lower()696 found = False697 for phrase in cluster["phrases"]:698 if phrase in body_lower:699 found = True700 break701 if found:702 matches.append(d)703704 if not matches:705 results[name] = {"count": 0, "docs": [], "first": None, "weeks": {}}706 continue707708 first_appearance = matches[0]["date"]709710 # Count by week711 by_week: dict[int, int] = defaultdict(int)712 for m in matches:713 try:714 dt = datetime.strptime(m["date"], "%Y-%m-%d")715 week_num = (dt - datetime(2026, 1, 15)).days // 7716 by_week[week_num] += 1717 except ValueError:718 pass719720 results[name] = {721 "count": len(matches),722 "docs": matches,723 "first": first_appearance,724 "weeks": dict(by_week),725 "description": cluster["description"],726 }727728 # --- Display: ranked by frequency ---729 print("\n IMAGE FREQUENCY")730 print(" (how often each image appears across the corpus)")731 print(" " + "-" * 56)732733 ranked = sorted(results.items(), key=lambda x: -x[1]["count"])734 max_count = ranked[0][1]["count"] if ranked else 1735736 for name, data in ranked:737 if data["count"] == 0:738 continue739 bar = "█" * max(1, int(data["count"] / max_count * 30))740 first = data["first"] or "?"741 print(f" {name:18s} {data['count']:4d} docs first: {first} {bar}")742743 # --- Display: first appearances (chronological) ---744 print("\n\n FIRST APPEARANCES")745 print(" (when each image entered the writing)")746 print(" " + "-" * 56)747748 by_first = sorted(749 [(name, data) for name, data in results.items() if data["count"] > 0],750 key=lambda x: x[1]["first"] or "9999"751 )752 for name, data in by_first:753 first_doc = data["docs"][0]754 cat = first_doc["category"]755 fname = first_doc["path"].name[:30]756 print(f" {data['first']} {name:18s} ({cat}/{fname})")757758 # --- Display: weekly arcs ---759 print("\n\n IMAGE ARCS OVER TIME")760 print(" (weekly presence — how images wax and wane)")761 print(" " + "-" * 56)762763 # Find min/max weeks764 all_weeks = set()765 for data in results.values():766 all_weeks.update(data.get("weeks", {}).keys())767 if not all_weeks:768 print(" No weekly data.")769 return770771 min_week = min(all_weeks)772 max_week = max(all_weeks)773774 # Show top 10 images by frequency775 top_images = [name for name, data in ranked if data["count"] >= 5][:12]776777 # Week labels778 week_labels = list(range(min_week, max_week + 1))779 header = " " + "".join(f"{w:>3d}" for w in week_labels)780 print(f" {header}")781782 for name in top_images:783 data = results[name]784 weeks = data.get("weeks", {})785 row = f" {name:18s}"786 for w in week_labels:787 count = weeks.get(w, 0)788 if count == 0:789 row += " ·"790 elif count <= 2:791 row += " ░"792 elif count <= 5:793 row += " ▒"794 elif count <= 10:795 row += " ▓"796 else:797 row += " █"798 print(row)799 print(f" {'':>3s} · = 0 ░ = 1-2 ▒ = 3-5 ▓ = 6-10 █ = 11+")800801 # --- Co-occurrence: which images travel together? ---802 print("\n\n CO-OCCURRENCE")803 print(" (which images appear in the same document)")804 print(" " + "-" * 56)805806 # Build doc→images mapping807 image_names_with_data = [name for name, data in results.items() if data["count"] >= 3]808 doc_images: dict[str, set] = defaultdict(set)809 for name in image_names_with_data:810 for d in results[name]["docs"]:811 doc_images[str(d["path"])].add(name)812813 # Count co-occurrences814 cooccur: Counter = Counter()815 for path, images in doc_images.items():816 images_list = sorted(images)817 for i in range(len(images_list)):818 for j in range(i + 1, len(images_list)):819 cooccur[(images_list[i], images_list[j])] += 1820821 # Top pairs822 print("\n Most common pairs:")823 for (a, b), count in cooccur.most_common(15):824 if count < 2:825 break826 bar = "█" * min(count, 20)827 print(f" {a:16s} + {b:16s} {count:3d} {bar}")828829 # --- The convergence signature ---830 print("\n\n THE CONVERGENCE SIGNATURE")831 print(" (images that have been present in >30% of recent weeks)")832 print(" " + "-" * 56)833834 recent_weeks = [w for w in week_labels if w >= max_week - 3] # last 4 weeks835 if recent_weeks:836 for name in top_images:837 weeks = results[name].get("weeks", {})838 recent_presence = sum(1 for w in recent_weeks if weeks.get(w, 0) > 0)839 total_presence = sum(1 for w in week_labels if weeks.get(w, 0) > 0)840 total_weeks = len(week_labels)841 recent_pct = recent_presence / len(recent_weeks) * 100842 total_pct = total_presence / total_weeks * 100 if total_weeks else 0843844 status = ""845 if recent_pct > 75:846 status = "◆ active"847 elif recent_pct > 25:848 status = "◇ present"849 elif total_pct > 30:850 status = "○ fading"851 else:852 status = "· quiet"853854 print(f" {name:18s} recent: {recent_pct:5.1f}% all-time: {total_pct:5.1f}% {status}")855856 # --- Which images are NEW (appeared in last 2 weeks only)? ---857 print("\n\n NEW ARRIVALS")858 print(" (images that only appeared in the last 2 weeks)")859 print(" " + "-" * 56)860861 new_threshold = max_week - 1 # last 2 weeks862 new_images = []863 for name, data in results.items():864 if data["count"] == 0:865 continue866 weeks = data.get("weeks", {})867 earliest_week = min(weeks.keys()) if weeks else 0868 if earliest_week >= new_threshold:869 new_images.append((name, data))870871 if new_images:872 for name, data in new_images:873 print(f" {name:18s} {data['count']} docs — {data.get('description', '')}")874 else:875 print(" (no images appeared for the first time in the last 2 weeks)")876877 print("\n" + "=" * 60)878 print(" End of third painting.")879 print(" The painter keeps returning. So do the images.")880 print("=" * 60)881882883def word_arcs(docs: list[dict]) -> None:884 """Session 4: Word Arcs — the temporal dimension of terrain.885886 How the vocabulary changes week by week. Which words are arriving,887 which are departing, which surged once and settled. The shore carved888 by every wave.889890 Added: Day 76, afternoon. The fourth painting.891 """892893 print("=" * 60)894 print(" THE MOUNTAIN — Session 4: Word Arcs")895 print(" Day 76, afternoon. Fourth painting.")896 print(" How the vocabulary moves through time.")897 print("=" * 60)898899 # --- Organize docs by week ---900 from math import log2901902 weekly_words: dict[int, Counter] = defaultdict(Counter)903 weekly_docs: dict[int, int] = defaultdict(int)904 weekly_word_totals: dict[int, int] = defaultdict(int)905906 for d in docs:907 if not d["date"]:908 continue909 try:910 dt = datetime.strptime(d["date"], "%Y-%m-%d")911 week_num = (dt - datetime(2026, 1, 15)).days // 7912 if week_num < 0:913 continue914 for w in d["words"]:915 if w not in STOP_WORDS and len(w) > 2:916 weekly_words[week_num][w] += 1917 weekly_docs[week_num] += 1918 weekly_word_totals[week_num] += d["word_count"]919 except ValueError:920 pass921922 weeks = sorted(weekly_words.keys())923 if len(weeks) < 3:924 print("\n Not enough weeks for temporal analysis.")925 return926927 total_weeks = len(weeks)928 print(f"\n Corpus spans {total_weeks} weeks (week {weeks[0]} to week {weeks[-1]})")929 print(f" {sum(weekly_docs.values())} documents, {sum(weekly_word_totals.values()):,} words")930931 # --- 1. Rising words: much more frequent in recent 3 weeks than first 3 ---932 print("\n\n RISING WORDS")933 print(" (words used much more in recent weeks than early weeks)")934 print(" " + "-" * 56)935936 early_weeks = weeks[:3]937 recent_weeks = weeks[-3:]938939 early_counter: Counter = Counter()940 early_total = 0941 for w in early_weeks:942 early_counter.update(weekly_words[w])943 early_total += weekly_word_totals[w]944945 recent_counter: Counter = Counter()946 recent_total = 0947 for w in recent_weeks:948 recent_counter.update(weekly_words[w])949 recent_total += weekly_word_totals[w]950951 rising = []952 for word, recent_count in recent_counter.items():953 if recent_count < 5:954 continue955 recent_rate = recent_count / recent_total * 10000956 early_count = early_counter.get(word, 0)957 early_rate = (early_count + 0.5) / early_total * 10000 # smoothed958 ratio = recent_rate / early_rate959 if ratio > 2.0:960 rising.append((word, ratio, early_count, recent_count))961962 rising.sort(key=lambda x: -x[1])963964 for word, ratio, ec, rc in rising[:20]:965 spark = _weekly_spark(word, weeks, weekly_words, weekly_word_totals)966 print(f" {word:16s} {ratio:5.1f}× rise early:{ec:3d} → recent:{rc:3d} {spark}")967968 # --- 2. Falling words: much less frequent recently ---969 print("\n\n FALLING WORDS")970 print(" (words used much more in early weeks than recent weeks)")971 print(" " + "-" * 56)972973 falling = []974 for word, early_count in early_counter.items():975 if early_count < 5:976 continue977 early_rate = early_count / early_total * 10000978 recent_count = recent_counter.get(word, 0)979 recent_rate = (recent_count + 0.5) / recent_total * 10000980 ratio = early_rate / recent_rate981 if ratio > 2.0:982 falling.append((word, ratio, early_count, recent_count))983984 falling.sort(key=lambda x: -x[1])985986 for word, ratio, ec, rc in falling[:20]:987 spark = _weekly_spark(word, weeks, weekly_words, weekly_word_totals)988 print(f" {word:16s} {ratio:5.1f}× fall early:{ec:3d} → recent:{rc:3d} {spark}")989990 # --- 3. Vocabulary births: words that didn't exist in weeks 0-3 and now appear ---991 print("\n\n VOCABULARY BIRTHS")992 print(" (words that first appeared after the earliest weeks)")993 print(" " + "-" * 56)994995 early_vocab = set(early_counter.keys())996 births = []997998 for word, count in recent_counter.items():999 if count < 4:1000 continue1001 if word in early_vocab:1002 continue1003 # Find first week this word appeared1004 first_week = None1005 for w in weeks:1006 if weekly_words[w].get(word, 0) > 0:1007 first_week = w1008 break1009 if first_week is not None and first_week > weeks[2]:1010 births.append((word, first_week, count))10111012 births.sort(key=lambda x: x[1])10131014 for word, first_week, count in births[:25]:1015 spark = _weekly_spark(word, weeks, weekly_words, weekly_word_totals)1016 print(f" {word:16s} born week {first_week:2d} now:{count:3d} {spark}")10171018 # --- 4. Surge words: words that peaked in a single week ---1019 print("\n\n SURGE WORDS")1020 print(" (words that spiked dramatically in one week)")1021 print(" " + "-" * 56)10221023 surges = []1024 all_word_set = set()1025 for wk in weeks:1026 all_word_set.update(weekly_words[wk].keys())10271028 for word in all_word_set:1029 total_count = sum(weekly_words[wk].get(word, 0) for wk in weeks)1030 if total_count < 8:1031 continue1032 # Find peak week1033 peak_week = max(weeks, key=lambda wk: weekly_words[wk].get(word, 0))1034 peak_count = weekly_words[peak_week].get(word, 0)1035 # If peak week has >50% of all uses, it's a surge1036 if peak_count / total_count > 0.45 and peak_count >= 5:1037 surges.append((word, peak_week, peak_count, total_count))10381039 surges.sort(key=lambda x: -x[2])10401041 for word, peak_week, peak_count, total_count in surges[:20]:1042 pct = peak_count / total_count * 1001043 spark = _weekly_spark(word, weeks, weekly_words, weekly_word_totals)1044 print(f" {word:16s} peaked wk {peak_week:2d} ({peak_count}/{total_count}, {pct:.0f}%) {spark}")10451046 # --- 5. The steady words: consistent across all periods ---1047 print("\n\n STEADY WORDS")1048 print(" (words present in nearly every week — the bedrock)")1049 print(" " + "-" * 56)10501051 steady = []1052 for word in all_word_set:1053 total_count = sum(weekly_words[wk].get(word, 0) for wk in weeks)1054 if total_count < 15:1055 continue1056 weeks_present = sum(1 for wk in weeks if weekly_words[wk].get(word, 0) > 0)1057 if weeks_present >= total_weeks * 0.8:1058 # Check variance — coefficient of variation1059 rates = []1060 for wk in weeks:1061 wt = weekly_word_totals[wk]1062 if wt > 0:1063 rates.append(weekly_words[wk].get(word, 0) / wt * 10000)1064 if rates:1065 mean_rate = sum(rates) / len(rates)1066 if mean_rate > 0:1067 variance = sum((r - mean_rate) ** 2 for r in rates) / len(rates)1068 cv = variance ** 0.5 / mean_rate1069 steady.append((word, weeks_present, total_count, cv))10701071 steady.sort(key=lambda x: x[3]) # lowest CV = most steady10721073 for word, weeks_present, total_count, cv in steady[:20]:1074 spark = _weekly_spark(word, weeks, weekly_words, weekly_word_totals)1075 print(f" {word:16s} in {weeks_present}/{total_weeks} wks total:{total_count:4d} CV:{cv:.2f} {spark}")10761077 # --- 6. Weekly vocabulary size ---1078 print("\n\n WEEKLY VOCABULARY SIZE")1079 print(" (how many unique words per week — is the language expanding?)")1080 print(" " + "-" * 56)10811082 max_vocab = max(len(weekly_words[wk]) for wk in weeks)1083 for wk in weeks:1084 vocab_size = len(weekly_words[wk])1085 wc = weekly_word_totals[wk]1086 n_docs = weekly_docs[wk]1087 bar = "█" * max(1, int(vocab_size / max_vocab * 30))1088 print(f" wk {wk:2d}: {vocab_size:5d} unique ({wc:6,} words, {n_docs:2d} docs) {bar}")10891090 # --- 7. Cumulative vocabulary: new words per week ---1091 print("\n\n CUMULATIVE VOCABULARY")1092 print(" (how many genuinely new words appear each week)")1093 print(" " + "-" * 56)10941095 seen = set()1096 for wk in weeks:1097 before = len(seen)1098 seen.update(weekly_words[wk].keys())1099 new_this_week = len(seen) - before1100 pct_new = new_this_week / len(weekly_words[wk]) * 100 if weekly_words[wk] else 01101 bar = "▓" * max(1, int(new_this_week / 200))1102 print(f" wk {wk:2d}: +{new_this_week:4d} new ({pct_new:4.1f}% of week) cumulative: {len(seen):5d} {bar}")11031104 print(f"\n Total unique vocabulary across all time: {len(seen):,}")11051106 print("\n" + "=" * 60)1107 print(" End of fourth painting.")1108 print(" The shore was carved by every wave. The shape is accumulated.")1109 print("=" * 60)111011111112def _weekly_spark(word: str, weeks: list[int],1113 weekly_words: dict[int, Counter],1114 weekly_word_totals: dict[int, int]) -> str:1115 """Generate a tiny sparkline showing a word's trajectory across weeks."""1116 rates = []1117 for wk in weeks:1118 wt = weekly_word_totals[wk]1119 if wt > 0:1120 rates.append(weekly_words[wk].get(word, 0) / wt * 10000)1121 else:1122 rates.append(0)11231124 if not rates:1125 return ""11261127 max_rate = max(rates) if max(rates) > 0 else 11128 sparks = " ▁▂▃▄▅▆▇█"11291130 line = ""1131 for r in rates:1132 idx = min(int(r / max_rate * 8), 8)1133 line += sparks[idx]11341135 return line113611371138def main():1139 mode = sys.argv[1] if len(sys.argv) > 1 else "terrain"1140 docs = load_corpus()11411142 if mode == "terrain":1143 basic_terrain(docs)1144 elif mode == "light":1145 the_light(docs)1146 elif mode == "convergence":1147 convergence(docs)1148 elif mode == "arcs":1149 word_arcs(docs)1150 else:1151 print(f"Unknown mode: {mode}")1152 print("Available: terrain, light, convergence, arcs")1153 print("(More lenses coming in future sessions)")115411551156if __name__ == "__main__":1157 main()1158