Words to Data - Convert Legal Documents Into Diffable Data Structures

Quick Start Examples

You can download these XML files directly here:

Parse a US Code Document

use words_to_data::uslm::parser::parse;

/// Parses Title 26 of the US Code, prints the chapeau of §174(a), and writes
/// the parsed tree to `title_26.json`.
///
/// # Errors
///
/// Returns any error propagated from `parse` or `write_json_file`.
///
/// # Panics
///
/// Panics if the §174(a) path lookup does not succeed.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Load a USCode Title
    let title_26 = parse("tests/test_data/usc/2025-07-18/usc26.xml", "2025-07-18")?;

    // Navigate to §174(a)
    let s174a = title_26
        .find("uscode/title_26/subtitle_A/chapter_1/subchapter_B/part_VI/section_174/subsection_a")
        .expect("§174 (a) not found");

    // Print the chapeau value; borrow it as `&str` rather than cloning the
    // `String` and allocating a fallback just to format it.
    println!(
        "§ 174(a) chapeau: {}",
        s174a.data.chapeau.as_deref().unwrap_or("<Empty>")
    );

    // Serialize
    words_to_data::utils::write_json_file(&title_26, "title_26.json")?;
    Ok(())
}

from words_to_data import parse_uslm_xml

# Parse the 2025-07-18 release of Title 26.
title_26 = parse_uslm_xml(
    "tests/test_data/usc/2025-07-18/usc26.xml",
    "2025-07-18",
)

# Look up §174(a) by its full element path.
s174a = title_26.find(
    "uscode/title_26/subtitle_A/chapter_1/subchapter_B/part_VI/section_174/subsection_a"
)

# Show the subsection's chapeau text.
chapeau = s174a.data['chapeau']
print(f"§ 174(a) chapeau: {chapeau}")

output:

§ 174(a) chapeau: In the case of a taxpayer’s specified research or experimental expenditures for any taxable year—
                
                
# JSON Serialized data
"data": {
    "path": "uscode/title_26/subtitle_A/chapter_1/subchapter_B/part_VI/section_174/subsection_a",
    "element_type": "subsection",
    "document_type": {
        "us_code": {
        "usc_type": "title"
        }
    },
    "date": [
        2025,
        199
    ],
    "number_value": "a",
    "number_display": "(a)",
    "verbose_name": " Title 26— Subtitle A— CHAPTER 1— Subchapter B— PART VI— § 174. (a)",
    "heading": " In general",
    "chapeau": "In the case of a taxpayer’s specified research or experimental expenditures for any taxable year—",
    "proviso": null,
    "content": null,
    "continuation": null,
    "uslm_id": "/us/usc/t26/s174/a",
    "uslm_uuid": "ida60c2e6d-e7ce-11ef-b868-e6270ec5930c",
    "source_credits": []
    },
    "children": [
    {
        "data": {
        "path": "uscode/title_26/subtitle_A/chapter_1/subchapter_B/part_VI/section_174/subsection_a/paragraph_1",
    ...

Compute a Diff Between Versions

use words_to_data::{diff::TreeDiff, uslm::parser::parse};

/// Parses two releases of Title 26, diffs them, prints every field-level
/// change recorded for §174(a), and writes the full diff to `diff.json`.
///
/// # Errors
///
/// Returns any error propagated from `parse` or `write_json_file`.
///
/// # Panics
///
/// Panics if the diff has no entry at the §174(a) path.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let doc_old = parse("tests/test_data/usc/2025-07-18/usc26.xml", "2025-07-18")?;
    let doc_new = parse("tests/test_data/usc/2025-07-30/usc26.xml", "2025-07-30")?;

    let diff = TreeDiff::from_elements(&doc_old, &doc_new);

    // The message names §174(a) explicitly: "174A" would read as a different section.
    let s174a_diff = diff
        .find("uscode/title_26/subtitle_A/chapter_1/subchapter_B/part_VI/section_174/subsection_a")
        .expect("§174(a) has no changes, nor do its children!");

    for change in s174a_diff.changes.iter() {
        println!("{:#?} Changed:", change.field_name);
        println!("  Old: {}", change.old_value);
        println!("  New: {}", change.new_value);
        println!("  Number of word-level changes: {}", change.changes.len());
    }

    // Serialize the whole diff, not just the §174(a) subtree.
    words_to_data::utils::write_json_file(&diff, "diff.json")?;
    Ok(())
}

from words_to_data import parse_uslm_xml, compute_diff

# Parse the two releases of Title 26 that will be compared.
doc_old = parse_uslm_xml(
    "tests/test_data/usc/2025-07-18/usc26.xml",
    "2025-07-18",
)
doc_new = parse_uslm_xml(
    "tests/test_data/usc/2025-07-30/usc26.xml",
    "2025-07-30",
)

# Diff the two trees, then look up the entry for §174(a).
diff = compute_diff(doc_old, doc_new)
s174a_diff = diff.find(
    "uscode/title_26/subtitle_A/chapter_1/subchapter_B/part_VI/section_174/subsection_a"
)

# Report each changed field with its old and new text.
for change in s174a_diff.changes:
    print(f"{change.field_name} Changed:")
    print(f"  Old: {change.old_value}")
    print(f"  New: {change.new_value}")
    word_level_count = len(change.changes)
    print(f"  Number of word-level changes: {word_level_count}")

output:

Chapeau Changed:
  Old: In the case of a taxpayer’s specified research or experimental expenditures for any taxable year—
  New: In the case of a taxpayer’s foreign research or experimental expenditures for any taxable year—
  Number of word-level changes: 2
                    
                
# JSON
{
    "root_path": "uscode/title_26/subtitle_A/chapter_1/subchapter_B/part_VI/section_174/subsection_a",
    "changes": [
    {
        "field_name": "chapeau",
        "from_date": [
        2025,
        199
        ],
        "to_date": [
        2025,
        211
        ],
        "old_value": "In the case of a taxpayer’s specified research or experimental expenditures for any taxable year—",
        "new_value": "In the case of a taxpayer’s foreign research or experimental expenditures for any taxable year—",
        "changes": [
        {
            "value": "specified",
            "old_index": 12,
            "new_index": null,
            "tag": "delete"
        },
        {
            "value": "foreign",
            "old_index": null,
            "new_index": 12,
            "tag": "insert"
        }
        ...

Extract Amendments from a Bill

use words_to_data::uslm::bill_parser::parse_bill_amendments;

/// Extracts the amendments from the bill XML and prints, for each one, its
/// source path, the number of targets, the action types, and every target
/// reference.
///
/// # Errors
///
/// Returns any error propagated from `parse_bill_amendments`.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let data = parse_bill_amendments("119-21", "tests/test_data/bills/119-hr-1/bill_119_hr_1.xml")?;

    println!(
        "Bill {}: {} amendments found",
        data.bill_id,
        data.amendments.len()
    );

    for amendment in data.amendments.values() {
        println!("\nAmendment at: {}", amendment.source_path);
        println!("  USC sections modified: {}", amendment.target_paths.len());
        println!("  Actions: {:?}", amendment.action_types);

        // List each target so the output matches the sample shown for this example.
        // NOTE(review): assumes each target exposes `display_text` and `path`,
        // as in the serialized JSON — confirm against the crate.
        for target in amendment.target_paths.iter() {
            println!("    - {} ({})", target.display_text, target.path);
        }
    }

    Ok(())
}

from words_to_data import parse_bill_amendments

# Extract every amendment from the bill XML.
data = parse_bill_amendments(
    "119-21",
    "tests/test_data/bills/119-hr-1/bill_119_hr_1.xml",
)

# Summary line: bill id and total amendment count.
amendment_count = len(data.amendments)
print(f"Bill {data.bill_id}: {amendment_count} amendments found")

# One report per amendment, followed by each target reference.
for amendment in data.amendments.values():
    print(f"\nAmendment at: {amendment.source_path}")
    target_count = len(amendment.target_paths)
    print(f"  USC sections modified: {target_count}")
    print(f"  Actions: {amendment.action_types}")

    for target in amendment.target_paths:
        print(f"    - {target.display_text} ({target.path})")

output:

Bill 119-21: 603 amendments found

Amendment at: /us/pl/119/21/tI/stA/s10101/a
  USC sections modified: 1
  Actions: [Amend, Delete, Insert]
    - 7 U.S.C. 2012 (/us/usc/t7/s2012)

Amendment at: /us/pl/119/21/tI/stA/s10101/b/1
  USC sections modified: 1
...
                    
                
# JSON
{
  "data": {
    "path": "publiclawdocument_119-21",
    "element_type": {
      "public_law_document": {
        "amendments": [
          {
            "action_types": [
              "amend",
              "delete",
              "insert"
            ],
            "target_paths": [
              {
                "path": "/us/usc/t7/s2012",
                "display_text": "7 U.S.C. 2012"
              }
            ],
            "source_path": "/us/pl/119/21/tI/stA/s10101/a"
          },
          {
        ...

⚖️→🔢

Convert Legal Documents Into Diffable Data Structures

Features

Parse Legal Documents

Hierarchical Diffing

Parallel Processing

JSON Serialization

Bill Amendment Extraction

Python Bindings

Dataset Management

And More

Installation

Rust

Python

Quick Start Examples

Parse a US Code Document

output:

Compute a Diff Between Versions

output:

Extract Amendments from a Bill

output:

Documentation & Resources

GitHub Repository

Crates.io

Get in Touch