This script analyzes my vault’s git repo to pull the files that have been edited most frequently across rolling time windows. It walks backwards through git commit history, resolving file renames to their current paths, and for each file tracks how many distinct days it was edited and how many commits touched it. Only files that currently exist are counted. It then ranks files by days edited and writes a Markdown report back into my vault.
Because the script relies on git history, I started using Git on mobile (with Working Copy) so that I have a reliable way to push my vault to the repository more frequently.
Python script
from __future__ import annotations
from argparse import ArgumentParser
from collections import defaultdict
from dataclasses import dataclass
from dataclasses import field
from datetime import datetime, timedelta, tzinfo
from pathlib import Path
import re
from typing import Dict, Iterable, List, Set, Tuple
from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
from git import Repo
@dataclass
class FileStats:
    """Aggregated edit stats for a single file.

    Attributes:
        edited_days: Distinct day strings on which the file was edited; the
            set de-duplicates several commits made on the same day.
        commit_touches: Number of commits that touched the file.
    """

    # A fresh set per instance (a shared mutable default would leak days
    # between files).
    edited_days: Set[str] = field(default_factory=set)
    commit_touches: int = 0
# Rolling report windows as (section heading, look-back span), shortest first.
# "6 months" and "1 year" are approximated as 182 and 365 days.
WINDOWS: List[Tuple[str, timedelta]] = [
    (heading, timedelta(days=span_days))
    for heading, span_days in (
        ("Last 7 days", 7),
        ("Last 30 days", 30),
        ("Last 60 days", 60),
        ("Last 6 months", 182),
        ("Last 1 year", 365),
    )
]
class AmericaChicagoFallback(tzinfo):
"""DST-aware fallback for America/Chicago when tzdata is unavailable."""
standard_offset = timedelta(hours=-6)
daylight_offset = timedelta(hours=-5)
dst_delta = timedelta(hours=1)
@staticmethod
def _first_sunday_on_or_after(dt: datetime) -> datetime:
return dt + timedelta(days=(6 - dt.weekday()) % 7)
@classmethod
def _dst_range(cls, year: int) -> Tuple[datetime, datetime]:
start = cls._first_sunday_on_or_after(datetime(year, 3, 8, 2))
end = cls._first_sunday_on_or_after(datetime(year, 11, 1, 2))
return start, end
@classmethod
def _is_daylight_saving(cls, dt: datetime) -> bool:
start, end = cls._dst_range(dt.year)
naive_dt = dt.replace(tzinfo=None)
return start <= naive_dt < end
def utcoffset(self, dt: datetime | None) -> timedelta:
if dt is None:
return self.standard_offset
return self.daylight_offset if self._is_daylight_saving(dt) else self.standard_offset
def dst(self, dt: datetime | None) -> timedelta:
if dt is None:
return timedelta(0)
return self.dst_delta if self._is_daylight_saving(dt) else timedelta(0)
def tzname(self, dt: datetime | None) -> str:
if dt is None:
return "CT"
return "CDT" if self._is_daylight_saving(dt) else "CST"
def fromutc(self, dt: datetime) -> datetime:
if dt.tzinfo is not self:
raise ValueError("fromutc: dt.tzinfo is not self")
utc_naive = dt.replace(tzinfo=None)
start_local, end_local = self._dst_range(utc_naive.year)
start_utc = start_local - self.standard_offset
end_utc = end_local - self.daylight_offset
offset = self.daylight_offset if start_utc <= utc_naive < end_utc else self.standard_offset
return (utc_naive + offset).replace(tzinfo=self)
def resolve_timezone(name: str) -> tzinfo:
    """Resolve an IANA timezone and provide a fallback for America/Chicago.

    Args:
        name: IANA time zone key, e.g. ``"America/Chicago"``.

    Returns:
        A ``ZoneInfo`` for ``name``, or ``AmericaChicagoFallback`` when no
        time zone data is available and ``name`` is ``"America/Chicago"``.

    Raises:
        RuntimeError: If ``name`` is malformed, or if no data exists for it
            and no built-in fallback applies.
    """
    try:
        return ZoneInfo(name)
    except ZoneInfoNotFoundError:
        if name == "America/Chicago":
            return AmericaChicagoFallback()
        raise RuntimeError(
            f"No time zone data found for '{name}'. Install the Python package 'tzdata' or pass a supported timezone."
        ) from None
    except ValueError:
        # ZoneInfo rejects malformed keys (absolute or non-normalized paths)
        # with ValueError; report it like any other unusable timezone.
        raise RuntimeError(
            f"Invalid time zone name '{name}'. Pass an IANA key such as 'America/Chicago'."
        ) from None
def normalize_git_path(path: str) -> str:
    """Normalize git rename notation to the destination path.

    Backslashes are converted to forward slashes. Paths without ``=>`` are
    otherwise returned unchanged.

    Examples of handled rename notation:
        ``folder/{old => new}.md``      -> ``folder/new.md``
        ``a/{b => }/c.md``              -> ``a/c.md``
        ``old/path.md => new/path.md``  -> ``new/path.md``
    """
    normalized = path.replace("\\", "/")
    if "=>" not in normalized:
        return normalized

    # Handles patterns like: folder/{old => new}.md
    while "{" in normalized and "}" in normalized and "=>" in normalized:
        start = normalized.find("{")
        end = normalized.find("}", start)
        if end == -1:
            break
        segment = normalized[start + 1 : end]
        if "=>" not in segment:
            break
        _, right = segment.split("=>", 1)
        prefix = normalized[:start]
        replacement = right.strip()
        suffix = normalized[end + 1 :]
        if not replacement and suffix.startswith("/") and (not prefix or prefix.endswith("/")):
            # An empty destination ("{old => }") means a directory level was
            # removed; drop its separator too so we do not produce "a//c.md"
            # or a leading "/".
            suffix = suffix[1:]
        normalized = prefix + replacement + suffix

    # Handles patterns like: old/path.md => new/path.md
    if "=>" in normalized:
        _, right = normalized.split("=>", 1)
        normalized = right.strip()
    return normalized
def resolve_current_path(path: str, rename_mapping: Dict[str, str]) -> str:
    """Follow rename chain until the latest known path.

    Args:
        path: A path as reported by git; rename notation is normalized first.
        rename_mapping: Maps an older path to the path it was renamed to.

    Returns:
        The last path reached by repeatedly looking up ``rename_mapping``.
    """
    current = normalize_git_path(path)
    # Remember visited names so a cyclic mapping (a -> b -> a) cannot loop forever.
    seen: Set[str] = set()
    while current in rename_mapping and current not in seen:
        seen.add(current)
        current = rename_mapping[current]
    return current
# Matches a bare ``type/...`` tag starting at a word boundary; group 1 is the tag.
# Applied to the frontmatter portion of a note.
FRONTMATTER_TYPE_PATTERN = re.compile(r"\b(type/[A-Za-z0-9_./-]+)\b")
# Matches an inline ``#type/...`` hashtag; group 1 is the tag without the leading ``#``.
# Applied to the body portion of a note.
CONTENT_TYPE_PATTERN = re.compile(r"#(type/[A-Za-z0-9_./-]+)\b")
def get_current_files(repo: Repo, ignore_paths: Set[str]) -> Set[str]:
    """Return files that currently exist in HEAD and match configured scope.

    Args:
        repo: Repository to inspect.
        ignore_paths: Normalized paths to leave out of the result.

    Returns:
        Normalized paths of every ``.md`` blob in the newest commit's tree
        that is not listed in ``ignore_paths``.
    """
    # The first commit yielded by iter_commits() is the newest one.
    latest_commit = next(repo.iter_commits())
    current_files: Set[str] = set()
    for item in latest_commit.tree.traverse():
        # traverse() also yields sub-trees; only blobs are files.
        if item.type != "blob":
            continue
        path = normalize_git_path(item.path)
        if path.endswith(".md") and path not in ignore_paths:
            current_files.add(path)
    return current_files
def extract_type_tags_from_file(file_path: Path) -> List[str]:
    """Extract type tags from frontmatter (type/...) and content (#type/...).

    Args:
        file_path: Path of the note to scan.

    Returns:
        Sorted, de-duplicated tags (without a leading ``#``). Empty when the
        path is not an existing ``.md`` file or cannot be read as UTF-8.
    """
    if file_path.suffix.lower() != ".md" or not file_path.is_file():
        return []
    try:
        text = file_path.read_text(encoding="utf-8")
    except (OSError, ValueError):
        # Best effort: an unreadable or non-UTF-8 note simply has no tags.
        # (UnicodeDecodeError is a ValueError subclass.)
        return []

    frontmatter = ""
    body = text
    if text.startswith("---"):
        # "---<frontmatter>---<body>" splits into ["", frontmatter, body].
        parts = text.split("---", 2)
        if len(parts) == 3:
            _, frontmatter, body = parts

    tags: Set[str] = set(FRONTMATTER_TYPE_PATTERN.findall(frontmatter))
    tags.update(CONTENT_TYPE_PATTERN.findall(body))
    return sorted(tags)
def build_type_tag_lookup(
    vault_path: Path,
    ranked_by_window: Dict[str, List[Tuple[str, FileStats]]],
) -> Dict[str, List[str]]:
    """Build a cache of file path -> extracted type tags.

    Args:
        vault_path: Directory that the relative paths are resolved against.
        ranked_by_window: Ranked ``(path, stats)`` rows keyed by window label.

    Returns:
        One entry per distinct path found in any window, mapped to the tags
        extracted from that file.
    """
    # A file usually appears in several windows; collect each path once so
    # every file is read a single time.
    all_paths: Set[str] = {
        path for rows in ranked_by_window.values() for path, _ in rows
    }
    return {
        rel_path: extract_type_tags_from_file(vault_path / rel_path)
        for rel_path in all_paths
    }
def aggregate_stats(
    repo: Repo,
    now: datetime,
    ignore_paths: Iterable[str],
) -> Dict[str, Dict[str, FileStats]]:
    """Aggregate file edit stats for each configured time window.

    Walks commits newest-first back to the oldest window cutoff. Each touched
    path is resolved through the renames seen so far, and is counted only if
    it is a current, non-ignored ``.md`` file.

    Args:
        repo: Repository whose history is scanned.
        now: Timezone-aware reference time; windows end here and commit
            times are converted to its timezone before taking the date.
        ignore_paths: Paths to exclude from the stats.

    Returns:
        ``{window label: {current path: FileStats}}`` with one entry per
        label in ``WINDOWS``.
    """
    # Cutoffs do not change while walking history, so compute them once.
    cutoffs = [(label, now - delta) for label, delta in WINDOWS]
    oldest_cutoff = now - max(delta for _, delta in WINDOWS)

    ignore = {normalize_git_path(p) for p in ignore_paths}
    current_files = get_current_files(repo, ignore_paths=ignore)
    rename_mapping: Dict[str, str] = {}
    stats_by_window: Dict[str, Dict[str, FileStats]] = {
        label: defaultdict(FileStats) for label, _ in WINDOWS
    }

    for commit in repo.iter_commits(since=oldest_cutoff.isoformat()):
        commit_time = commit.committed_datetime.astimezone(now.tzinfo)
        commit_day = commit_time.date().isoformat()
        active_labels = [label for label, cutoff in cutoffs if commit_time >= cutoff]

        if active_labels:
            for raw_path in commit.stats.files:
                # resolve_current_path() normalizes rename notation itself.
                resolved_path = resolve_current_path(raw_path, rename_mapping)
                if resolved_path in ignore:
                    continue
                if not resolved_path.endswith(".md"):
                    continue
                if resolved_path not in current_files:
                    continue
                for label in active_labels:
                    bucket = stats_by_window[label][resolved_path]
                    bucket.edited_days.add(commit_day)
                    bucket.commit_touches += 1

        # As we walk backward through history, map historical names to newer
        # names. This happens after counting: the commit that performs a
        # rename already reports the new name.
        if not commit.parents:
            # A root commit has nothing to be renamed from. Diffing it against
            # ``None`` would compare with the working tree, so skip it.
            continue
        try:
            for diff_item in commit.parents[0].diff(commit):
                if not getattr(diff_item, "renamed_file", False):
                    continue
                old_path = normalize_git_path(diff_item.a_path or "")
                new_path = normalize_git_path(diff_item.b_path or "")
                if not old_path or not new_path:
                    continue
                # Point the old name straight at the newest known name so
                # chains stay short.
                rename_mapping[old_path] = resolve_current_path(new_path, rename_mapping)
        except Exception:
            # Best effort: stats still work even if rename tracking fails for a commit.
            pass

    return stats_by_window
def rank_window(stats: Dict[str, FileStats], top_n: int) -> List[Tuple[str, FileStats]]:
    """Sort and trim to top N files for a given window.

    Ordering: most distinct edited days first, then most commit touches,
    then path name A-Z (case-insensitive) so ties are listed alphabetically.

    Args:
        stats: Per-path stats for one window.
        top_n: Maximum number of rows to return; values below 1 yield ``[]``.

    Returns:
        Up to ``top_n`` ``(path, stats)`` pairs in rank order.
    """
    ranked = sorted(
        stats.items(),
        # Negate the counts instead of using reverse=True, which would also
        # flip the alphabetical tie-breaker to Z-A.
        key=lambda item: (
            -len(item[1].edited_days),
            -item[1].commit_touches,
            item[0].lower(),
        ),
    )
    # A negative slice bound would drop rows from the end instead of
    # limiting the count.
    return ranked[: max(top_n, 0)]
def format_file_link(path: str) -> str:
    """Format markdown path as an Obsidian wikilink for .md files.

    ``notes/Idea.md`` becomes ``[[notes/Idea]]``; any other path is returned
    as inline code.
    """
    if path.endswith(".md"):
        # Wikilinks omit the ".md" extension.
        return f"[[{path[:-3]}]]"
    return f"`{path}`"
def build_markdown(
    ranked_by_window: Dict[str, List[Tuple[str, FileStats]]],
    now: datetime,
    type_tags_by_path: Dict[str, List[str]],
    generator_path: Path,
) -> str:
    """Build markdown report content.

    Args:
        ranked_by_window: Ranked ``(path, stats)`` rows keyed by window label.
        now: Generation time shown in the report.
        type_tags_by_path: Tags to show next to each path (may omit paths).
        generator_path: Path of the generating script, recorded in the
            frontmatter.

    Returns:
        The full report: YAML frontmatter, a generation stamp, then one
        ``##`` section per entry in ``WINDOWS``, ending with a newline.
    """
    # YAML single-quoted scalars escape an embedded quote by doubling it;
    # without this a path such as /Users/o'brien/... breaks the frontmatter.
    quoted_generator = str(generator_path).replace("'", "''")

    lines: List[str] = [
        "---",
        "tags:",
        " - type/system",
        f"path (dev): '{quoted_generator}'",
        "---",
        f"*Generated: {now.strftime('%Y-%m-%d %H:%M:%S %Z')}*",
    ]

    for label, _ in WINDOWS:
        rows = ranked_by_window.get(label, [])
        lines.append(f"## {label}")
        if not rows:
            lines.append("No edits found for this window.")
            continue
        for idx, (path, stat) in enumerate(rows, start=1):
            tags = type_tags_by_path.get(path, [])
            tag_suffix = f" ({', '.join(tags)})" if tags else ""
            lines.append(
                f"{idx}. {format_file_link(path)}{tag_suffix} - days edited: {len(stat.edited_days)}, commits touched: {stat.commit_touches}"
            )

    return "\n".join(lines) + "\n"
def main() -> None:
    """Parse CLI options, compute the rankings, and write the report into the vault.

    Raises:
        FileNotFoundError: If the resolved vault path does not exist.
        RuntimeError: If the repository is bare or the timezone cannot be
            resolved.
    """
    parser = ArgumentParser(
        description="Generate top edited files report for several rolling time windows."
    )
    parser.add_argument("--vault-path", default="vault", help="Path to the vault git repository")
    parser.add_argument(
        "--output",
        default="Most edited files.md",
        help="Markdown filename to write inside the vault",
    )
    parser.add_argument("--top", type=int, default=30, help="Number of files per window")
    parser.add_argument(
        "--timezone",
        default="America/Chicago",
        help="Timezone used for date windows",
    )
    args = parser.parse_args()

    vault_path_arg = Path(args.vault_path)
    if vault_path_arg.is_absolute():
        vault_path = vault_path_arg.resolve()
    else:
        # Relative paths are anchored at parents[2] of this script. It is
        # looked up only here, so an absolute --vault-path still works when
        # the script sits too close to the filesystem root to have one.
        workspace_root = Path(__file__).resolve().parents[2]
        vault_path = (workspace_root / vault_path_arg).resolve()
    output_path = vault_path / args.output

    if not vault_path.exists():
        raise FileNotFoundError(f"Vault path does not exist: {vault_path}")
    repo = Repo(vault_path)
    if repo.bare:
        raise RuntimeError(f"Not a valid git repository: {vault_path}")

    tz = resolve_timezone(args.timezone)
    now = datetime.now(tz)

    stats_by_window = aggregate_stats(
        repo=repo,
        now=now,
        # The report is rewritten on every run; keep it out of its own ranking.
        ignore_paths=[args.output],
    )
    ranked_by_window = {
        label: rank_window(stats_by_window[label], args.top) for label, _ in WINDOWS
    }
    type_tags_by_path = build_type_tag_lookup(vault_path, ranked_by_window)
    markdown = build_markdown(
        ranked_by_window,
        now,
        type_tags_by_path,
        Path(__file__).resolve(),
    )

    output_path.write_text(markdown, encoding="utf-8")
    print(f"Report written to {output_path}")


if __name__ == "__main__":
    main()