⚖️→🔢

Words to Data

Convert Legal Documents Into Diffable Data Structures

Features

📄

Parse Legal Documents

Extract hierarchical structure from US Code titles and Public Laws in USLM XML format with full structural preservation.

📊

Hierarchical Diffing

Compute word-level differences between document versions to precisely track changes over time.

Parallel Processing

Parse multiple documents concurrently using Rayon for blazing-fast performance.

📦

JSON Serialization

All data structures implement Serde traits for easy integration with any system.

🔍

Bill Amendment Extraction

Automatically identify USC references and amending actions from bills to track legislative changes.

🐍

Python Bindings

Full API access from Python with ergonomic bindings via PyO3. Available on PyPI.

And More

We're actively working on legal-specific diff algorithms, enhanced bill parsing, pre-built datasets, and congressional vote tracking.

View Full Roadmap

Installation

Rust

TOML
[dependencies]
words-to-data = "0.1.2"

Python

Bash
pip install words-to-data

Pre-built wheels available for Linux x86_64. Other platforms build from source.

Quick Start Examples

You can download these XML files directly here:

Parse a US Code Document

use words_to_data::uslm::parser::parse;

/// Parses Title 26 of the US Code, prints the chapeau of §174(a), and writes the parsed
/// document to `title_26.json`.
///
/// # Errors
///
/// Propagates any error returned by `parse` or `write_json_file`.
///
/// # Panics
///
/// Panics if nothing is found at the §174(a) path in the parsed document.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Load a USCode Title
    let title_26 = parse("tests/test_data/usc/2025-07-18/usc26.xml", "2025-07-18")?;

    // Navigate to §174(a)
    let s174a = title_26
        .find("uscodedocument_26/title_26/subtitle_A/chapter_1/subchapter_B/part_VI/section_174/subsection_a")
        .expect("§174 (a) not found");

    // Print the chapeau value. Borrow it as `&str` instead of cloning the
    // `Option<String>` and allocating a fallback `String` just to print it.
    println!(
        "§ 174(a) chapeau: {}",
        s174a.data.chapeau.as_deref().unwrap_or("<Empty>")
    );

    // Serialize
    words_to_data::utils::write_json_file(&title_26, "title_26.json")?;
    Ok(())
}
# Python equivalent of the Rust example above (without the JSON serialization step).
from words_to_data import parse_uslm_xml

# Load a US Code title; the second argument is the date of this document version
title_26 = parse_uslm_xml("tests/test_data/usc/2025-07-18/usc26.xml", "2025-07-18")
# Navigate to §174(a) by its full hierarchical path
s174a = title_26.find("uscodedocument_26/title_26/subtitle_A/chapter_1/subchapter_B/part_VI/section_174/subsection_a")
# Print the chapeau value
print(f"§ 174(a) chapeau: {s174a.data['chapeau']}")

output:

§ 174(a) chapeau: In the case of a taxpayer's specified research or experimental expenditures for any taxable year—
                
                
# JSON Serialized data
"data": {
    "path": "uscodedocument_26/title_26/subtitle_A/chapter_1/subchapter_B/part_VI/section_174/subsection_a",
    "element_type": "subsection",
    "document_type": {
        "us_code": {
        "usc_type": "title"
        }
    },
    "date": [
        2025,
        199
    ],
    "number_value": "a",
    "number_display": "(a)",
    "verbose_name": " Title 26— Subtitle A— CHAPTER 1— Subchapter B— PART VI— § 174. (a)",
    "heading": " In general",
    "chapeau": "In the case of a taxpayer's specified research or experimental expenditures for any taxable year—",
    "proviso": null,
    "content": null,
    "continuation": null,
    "uslm_id": "/us/usc/t26/s174/a",
    "uslm_uuid": "ida60c2e6d-e7ce-11ef-b868-e6270ec5930c",
    "source_credits": []
    },
    "children": [
    {
        "data": {
        "path": "uscodedocument_26/title_26/subtitle_A/chapter_1/subchapter_B/part_VI/section_174/subsection_a/paragraph_1",
    ...

                

Compute a Diff Between Versions

use words_to_data::{diff::TreeDiff, uslm::parser::parse};

/// Diffs two dated versions of Title 26 and reports every changed field recorded for §174(a).
///
/// Both versions are parsed, a `TreeDiff` is built from them, each change found at the
/// §174(a) path is printed, and the whole diff is written to `diff.json`.
///
/// # Errors
///
/// Propagates any error returned by `parse` or `write_json_file`.
///
/// # Panics
///
/// Panics if the diff has no entry at the §174(a) path.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let doc_old = parse("tests/test_data/usc/2025-07-18/usc26.xml", "2025-07-18")?;
    let doc_new = parse("tests/test_data/usc/2025-07-30/usc26.xml", "2025-07-30")?;

    let diff = TreeDiff::from_elements(&doc_old, &doc_new);

    // The path addresses subsection (a) of §174, so the failure message names §174(a);
    // "174A" would point readers at a different section.
    let s174a_diff = diff
        .find("uscodedocument_26/title_26/subtitle_A/chapter_1/subchapter_B/part_VI/section_174/subsection_a")
        .expect("Section 174(a) has no changes, and neither do its children!");

    for change in s174a_diff.changes.iter() {
        println!("{:#?} Changed:", change.field_name);
        println!("  Old: {}", change.old_value);
        println!("  New: {}", change.new_value);
        println!("  Number of word-level changes: {}", change.changes.len());
    }
    words_to_data::utils::write_json_file(&diff, "diff.json")?;
    Ok(())
}
# Python equivalent of the Rust diff example above (without the JSON serialization step).
from words_to_data import parse_uslm_xml, compute_diff

# Parse the two dated versions of Title 26 to compare
doc_old = parse_uslm_xml("tests/test_data/usc/2025-07-18/usc26.xml", "2025-07-18")
doc_new = parse_uslm_xml("tests/test_data/usc/2025-07-30/usc26.xml", "2025-07-30")

# Build the diff between the old and new documents
diff = compute_diff(doc_old, doc_new)

# Look up the changes recorded at §174(a)
# NOTE(review): presumably returns None when nothing changed at this path — confirm before relying on `.changes`
s174a_diff = diff.find("uscodedocument_26/title_26/subtitle_A/chapter_1/subchapter_B/part_VI/section_174/subsection_a")

# Report each changed field with its old/new text and the number of word-level edits
for change in s174a_diff.changes:
    print(f"{change.field_name} Changed:")
    print(f"  Old: {change.old_value}")
    print(f"  New: {change.new_value}")
    print(f"  Number of word-level changes: {len(change.changes)}")

output:

Chapeau Changed:
  Old: In the case of a taxpayer's specified research or experimental expenditures for any taxable year—
  New: In the case of a taxpayer's foreign research or experimental expenditures for any taxable year—
  Number of word-level changes: 2
                    
                
# JSON
{
    "root_path": "uscodedocument_26/title_26/subtitle_A/chapter_1/subchapter_B/part_VI/section_174/subsection_a",
    "changes": [
    {
        "field_name": "chapeau",
        "from_date": [
        2025,
        199
        ],
        "to_date": [
        2025,
        211
        ],
        "old_value": "In the case of a taxpayer's specified research or experimental expenditures for any taxable year—",
        "new_value": "In the case of a taxpayer's foreign research or experimental expenditures for any taxable year—",
        "changes": [
        {
            "value": "specified",
            "old_index": 12,
            "new_index": null,
            "tag": "delete"
        },
        {
            "value": "foreign",
            "old_index": null,
            "new_index": 12,
            "tag": "insert"
        }
        ...
                

Extract Amendments from a Bill

use words_to_data::uslm::bill_parser::parse_bill_amendments;

/// Extracts the amendments from Public Law 119-21 and prints a summary of each one.
///
/// For every amendment this prints its source path, the number of USC targets, the
/// action types, and one line per target reference.
///
/// # Errors
///
/// Propagates any error returned by `parse_bill_amendments`.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let data = parse_bill_amendments("tests/test_data/bills/hr-119-21.xml")?;

    println!(
        "Bill {}: {} amendments found",
        data.bill_id,
        data.amendments.len()
    );

    for amendment in &data.amendments {
        println!("\nAmendment at: {}", amendment.source_path);
        println!("  USC sections modified: {}", amendment.target_paths.len());
        println!("  Actions: {:?}", amendment.action_types);

        // List each referenced USC target, e.g. "- 7 U.S.C. 2012 (/us/usc/t7/s2012)",
        // so the program produces the per-reference lines shown in the sample output.
        for target in &amendment.target_paths {
            println!("    - {} ({})", target.display_text, target.path);
        }
    }

    Ok(())
}
# Python equivalent of the Rust bill-amendment example above.
from words_to_data import parse_bill_amendments

# Extract the amendments from the bill's XML
data = parse_bill_amendments("tests/test_data/bills/hr-119-21.xml")

# Summary line: bill identifier and total number of amendments
print(f"Bill {data.bill_id}: {len(data.amendments)} amendments found")

# Per-amendment details: where it appears in the bill, how many USC targets, and which actions
for amendment in data.amendments:
    print(f"\nAmendment at: {amendment.source_path}")
    print(f"  USC sections modified: {len(amendment.target_paths)}")
    print(f"  Actions: {amendment.action_types}")

    # List each referenced USC target as "display text (path)"
    for ref in amendment.target_paths:
        print(f"    - {ref.display_text} ({ref.path})")

output:

Bill 119-21: 603 amendments found

Amendment at: /us/pl/119/21/tI/stA/s10101/a
  USC sections modified: 1
  Actions: [Amend, Delete, Insert]
    - 7 U.S.C. 2012 (/us/usc/t7/s2012)

Amendment at: /us/pl/119/21/tI/stA/s10101/b/1
  USC sections modified: 1
...
                    
                
# JSON
{
  "data": {
    "path": "publiclawdocument_119-21",
    "element_type": {
      "public_law_document": {
        "amendments": [
          {
            "action_types": [
              "amend",
              "delete",
              "insert"
            ],
            "target_paths": [
              {
                "path": "/us/usc/t7/s2012",
                "display_text": "7 U.S.C. 2012"
              }
            ],
            "source_path": "/us/pl/119/21/tI/stA/s10101/a"
          },
          {
        ...
                

Documentation & Resources

🔧

GitHub Repository

Source code and issue tracking for the Words to Data project.

View Source
📦

Crates.io

Download the latest version and view version history on the official Rust package registry.

View Crate

Get in Touch

Have questions, feedback, or partnership opportunities? We'd love to hear from you.

📧
contact@wordstodata.com

For technical support, please open an issue on GitHub.