Refactor Line

The previous definition of the `Line` type was unwieldy and too difficult to
work with. The new one should make the iterator implementation much easier and
has a clearer structure on its own.
dkanus 2025-08-06 23:07:44 +07:00
parent 4b9d6a6adb
commit 579c2a4d3d
2 changed files with 132 additions and 66 deletions
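
For context: the old three-variant `Line` enum forced every consumer to match `Standalone` / `Spanned` / `SpannedWithTokens` separately, while the new flat struct exposes the same information through `continued_from` and `local_range()`. A minimal sketch of a consumer, assuming a `TokenizedFile` value `file` (hypothetical, not part of this commit):

    // Iterate over the tokens that start on each physical line;
    // no per-variant match is needed with the new representation.
    for line in &file.lines {
        if let Some(range) = line.local_range() {
            for span in &file.buffer[range] {
                println!("{:?}: {:?}", span.token, span.lexeme);
            }
        }
    }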

View File

@@ -7,8 +7,6 @@
 //!
 //! These checks have been moved to the parent module.
 
-use super::Line;
-
 /// A technical trait that adds debug helpers to the lexer.
 pub trait DebugTools {
     /// Pretty-prints the internal layout of the tokenised file - useful when
@@ -31,62 +29,55 @@ pub trait DebugTools {
 impl<'src> DebugTools for super::TokenizedFile<'src> {
     fn reconstruct_source(&self) -> String {
-        let mut result = String::new();
-        for line in &self.lines {
-            if let Line::Standalone(token_range) | Line::SpannedWithTokens(_, token_range) = line {
-                for span in &self.buffer[token_range.clone()] {
-                    result.push_str(span.lexeme);
-                }
-            }
-        }
-        result
+        self.buffer.iter().map(|span| span.lexeme).collect()
     }
 
     fn dump_debug_layout(&self) {
-        for (row_index, line) in self.lines.iter().enumerate() {
-            println!("Line {}", row_index + 1);
-            match line {
-                Line::Standalone(token_range) => {
+        for (row_idx, line) in self.lines.iter().enumerate() {
+            println!("Line {}", row_idx + 1);
+            match (line.continued_from, line.local_range()) {
+                // Stand-alone line (all tokens start here)
+                (None, Some(range)) => {
                     println!("\t[Standalone]");
-                    let mut column_utf16 = 0usize;
-                    for next_token_span in &self.buffer[token_range.clone()] {
-                        let token_beginning = column_utf16;
-                        let token_end = column_utf16 + next_token_span.length_utf16;
-                        println!(
-                            "\t\t{:?} @ {}-{}: {:?}",
-                            next_token_span.token,
-                            token_beginning,
-                            token_end,
-                            next_token_span.lexeme
-                        );
-                        column_utf16 = token_end;
-                    }
+                    dump_spans(&self.buffer[range.clone()]);
                 }
-                Line::Spanned(origin_row) => {
-                    // `origin_row` is 0-based
+                // Pure continuation - the only thing on this line is
+                // the remainder of a multi-line token that started earlier.
+                (Some(origin_row), None) => {
                     println!(
-                        "\t[Continued from line {} - no new tokens here]",
+                        "\t[Continued from line {} – no new tokens here]",
                         origin_row + 1
                     );
                 }
-                Line::SpannedWithTokens(origin_row, token_range) => {
-                    // `origin_row` is 0-based
+                // Continuation **plus** some fresh tokens that begin here.
+                (Some(origin_row), Some(range)) => {
                     println!("\t[Continued from line {} + new tokens]", origin_row + 1);
-                    let mut column_utf16 = 0usize;
-                    for next_token_span in &self.buffer[token_range.clone()] {
-                        let token_beginning = column_utf16;
-                        let token_end = column_utf16 + next_token_span.length_utf16;
-                        println!(
-                            "\t\t{:?} @ {}-{}: {:?}",
-                            next_token_span.token,
-                            token_beginning,
-                            token_end,
-                            next_token_span.lexeme
-                        );
-                        column_utf16 = token_end;
-                    }
+                    dump_spans(&self.buffer[range.clone()]);
                 }
+                // An empty physical line (should be rare, but let's be safe).
+                (None, None) => {
+                    println!("\t[Empty line]");
+                }
             }
         }
     }
 }
+
+/// Helper that prints every span in `spans` together with its UTF-16
+/// column boundaries.
+fn dump_spans<'a>(spans: &[super::TokenSpan<'a>]) {
+    let mut col_utf16 = 0usize;
+    for span in spans {
+        let start = col_utf16;
+        let end = start + span.length_utf16;
+        println!(
+            "\t\t{:?} @ {}–{}: {:?}",
+            span.token, start, end, span.lexeme
+        );
+        col_utf16 = end;
+    }
+}
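
For orientation, `dump_debug_layout` now prints one block per physical line. For a file whose second line only carries the tail of a multi-line token, the output would look roughly like this (token name, columns, and lexeme illustrative):

    Line 1
        [Standalone]
            Comment @ 0–13: "/* multi-line"
    Line 2
        [Continued from line 1 – no new tokens here]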

View File

@@ -30,7 +30,7 @@
 mod debug_tools;
 mod lexing;
 
-use std::ops::Range;
+use std::{cmp::Ordering, ops::Range};
 
 use logos::Logos;
@@ -46,27 +46,36 @@ const DEFAULT_TOKEN_BUFFER_CAPACITY: usize = 20_000;
 ///
 /// *No absolute coordinates* are stored - they are recomputed per line.
 #[derive(Debug, Clone, Copy)]
-struct TokenSpan<'src> {
-    lexeme: &'src str,
-    token: Token,
-    length_utf16: usize,
+pub struct TokenSpan<'src> {
+    pub lexeme: &'src str,
+    pub token: Token,
+    pub length_utf16: usize,
 }
 
+/// Defines the location of a token inside a [`TokenizedFile`] in a way that
+/// is convenient for communicating through LSP.
+#[derive(Eq, Clone, Copy)]
+pub struct TokenLocation {
+    line_number: usize,
+    column: usize,
+}
+
+/// Type for indexing lines in a [`TokenizedFile`].
+type LineNumber = usize;
+
+/// Type for indexing specific tokens inside each [`Line`].
+type TokenIndex = usize;
+
 /// Representation of a single physical line of the source file.
 ///
-/// [`Range<usize>`] are used instead of slices to avoid creating
+/// [`Range<TokenIndex>`] are used instead of slices to avoid creating
 /// a self-referential struct (with [`TokenizedFile`]), which rust forbids.
 #[derive(Clone)]
-enum Line {
-    /// A standalone line that owns a contiguous slice in
-    /// the [`TokenizedFile::buffer`] arena.
-    Standalone(Range<usize>),
-    /// A 0-based line that is part of a multi-line token started on
-    /// another line.
-    Spanned(usize),
-    /// A 0-based line that is part of a multi-line token started on
-    /// another line *and* contains additional tokens local to itself.
-    SpannedWithTokens(usize, Range<usize>),
+struct Line {
+    /// Line where a multi-line token covering this line began (`None` for standalone lines).
+    continued_from: Option<LineNumber>,
+    /// Contiguous range of tokens that start on this line (`start >= end` iff empty).
+    local_range: Range<TokenIndex>,
 }
 
 /// A tokenized, lossless representation of an UnrealScript source file.
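
The three old variants map one-to-one onto the new struct; in terms of the constructors added further down in this commit:

    // Line::Standalone(r)                -> Line::standalone(r)                 // continued_from: None
    // Line::Spanned(origin)              -> Line::spanned(origin)               // local_range: 0..0
    // Line::SpannedWithTokens(origin, r) -> Line::spanned_with_tokens(origin, r)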
@@ -175,7 +184,7 @@ impl<'src> Tokenizer<'src> {
         // exactly `1` interior line)
         let insert_count = newline_count - 1;
         for _ in 0..insert_count {
-            self.lines.push(Line::Spanned(start_line));
+            self.lines.push(Line::spanned(start_line));
         }
 
         // This is called *after* `commit_current_line()` cleared previous
         // stored value
@@ -200,9 +209,9 @@
         // `commit_current_line()` only applies it once.
         // This guarantees no "bleed" between adjacent multi-line tokens.
         if let Some(from) = self.multi_line_start.take() {
-            self.lines.push(Line::SpannedWithTokens(from, slice));
+            self.lines.push(Line::spanned_with_tokens(from, slice));
         } else {
-            self.lines.push(Line::Standalone(slice));
+            self.lines.push(Line::standalone(slice));
         }
         self.slice_start_index = slice_end;
@@ -216,7 +225,7 @@
         // If we still have a `multi_line_start` (i.e. a pure multi-line token
        // with no local tokens on its last line), push a bare spanned entry.
         if let Some(from) = self.multi_line_start.take() {
-            self.lines.push(Line::Spanned(from));
+            self.lines.push(Line::spanned(from));
         }
 
         // Optimize for size
@@ -251,6 +260,72 @@ fn token_can_span_lines(token: &Token) -> bool {
     )
 }
 
+impl Line {
+    /// Creates a standalone line that owns a contiguous slice in
+    /// the [`TokenizedFile::buffer`] arena.
+    fn standalone(locals: Range<TokenIndex>) -> Line {
+        Line {
+            continued_from: None,
+            local_range: locals,
+        }
+    }
+
+    /// Creates a line that is part of a multi-line token started on
+    /// another line, referencing the 0-based index of its origin.
+    fn spanned(carried: LineNumber) -> Line {
+        Line {
+            continued_from: Some(carried),
+            local_range: 0..0,
+        }
+    }
+
+    /// Creates a line that is part of a multi-line token started on
+    /// another line and also contains additional tokens local to itself.
+    fn spanned_with_tokens(carried: LineNumber, locals: Range<TokenIndex>) -> Line {
+        Line {
+            continued_from: Some(carried),
+            local_range: locals,
+        }
+    }
+
+    /// Returns the range of tokens inside [`TokenizedFile::buffer`] that
+    /// start on this line.
+    ///
+    /// [`None`] means there are no such tokens; otherwise the range is
+    /// guaranteed to be non-empty.
+    fn local_range(&self) -> Option<Range<TokenIndex>> {
+        if self.local_range.is_empty() {
+            None
+        } else {
+            Some(self.local_range.clone())
+        }
+    }
+
+    /// Returns the number of tokens on this line.
+    ///
+    /// Counts both the tokens that start on this line and a token that
+    /// continues from a previous one.
+    fn len(&self) -> usize {
+        (self.continued_from.is_some() as usize) + (self.local_range.end - self.local_range.start)
+    }
+}
+
+impl PartialEq for TokenLocation {
+    fn eq(&self, other: &TokenLocation) -> bool {
+        self.line_number == other.line_number && self.column == other.column
+    }
+}
+
+impl PartialOrd for TokenLocation {
+    fn partial_cmp(&self, other: &TokenLocation) -> Option<Ordering> {
+        if self.line_number == other.line_number {
+            self.column.partial_cmp(&other.column)
+        } else {
+            self.line_number.partial_cmp(&other.line_number)
+        }
+    }
+}
+
 /// Counts the number of newlines in the given text.
 fn count_newlines(text: &str) -> usize {
     let mut bytes_iterator = text.as_bytes().iter().peekable();