From 933722bd42393b886683453d88ef619cea1b3ae3 Mon Sep 17 00:00:00 2001
From: dkanus
Date: Thu, 7 Aug 2025 17:24:35 +0700
Subject: [PATCH] Add iterator over tokens to TokenizedFile

---
 rottlib/src/lexer/iterator.rs | 191 ++++++++++++++++++++++++++++++++++
 rottlib/src/lexer/lexing.rs   |  24 +++++
 rottlib/src/lexer/mod.rs      | 165 ++++++++++++++++-------------
 3 files changed, 306 insertions(+), 74 deletions(-)
 create mode 100644 rottlib/src/lexer/iterator.rs

diff --git a/rottlib/src/lexer/iterator.rs b/rottlib/src/lexer/iterator.rs
new file mode 100644
index 0000000..d990128
--- /dev/null
+++ b/rottlib/src/lexer/iterator.rs
@@ -0,0 +1,191 @@
+//! Sub-module that adds an iterator to [`TokenizedFile`] which yields tokens in
+//! the order they appear in the source code.
+//!
+//! ## Examples
+//!
+//! ```rust
+//! let file = TokenizedFile::from_str("0 / 0");
+//! let iter = file.tokens().without_whitespace();
+//! ```
+//!
+//! ## Terminology: continued tokens
+//!
+//! Some [`super::Token`]s (e.g. [`super::Token::CppText`] or
+//! [`super::Token::BlockComment`]) can span multiple lines and are recorded on
+//! every line on which they appear (usually as the first, and sometimes
+//! the only, token).
+//! In this module these are referred to as "continued" or
+//! "carried-over" tokens.
+//! Since our iterator needs to return each token only once, we take special
+//! care to skip such continued tokens during iteration.
+
+use super::{TokenLocation, TokenPiece, TokenizedFile};
+
+/// An immutable iterator over all tokens in a [`TokenizedFile`], preserving
+/// their order of appearance in the original source file.
+///
+/// After exhaustion it keeps returning [`None`].
+#[must_use]
+#[derive(Clone, Debug)]
+pub struct Tokens<'src> {
+    /// [`TokenLocation`] of the next token to be returned.
+    ///
+    /// [`None`] means the iterator has been exhausted.
+    cursor: Option<TokenLocation>,
+    /// [`TokenizedFile`] whose tokens we're iterating over.
+    source_file: &'src TokenizedFile<'src>,
+    /// When `true`, whitespace tokens are skipped.
+    skip_whitespace: bool,
+}
+
+// Once this iterator has returned [`None`], it keeps returning [`None`].
+impl<'src> std::iter::FusedIterator for Tokens<'src> {}
+
+impl<'src> Tokens<'src> {
+    /// Makes the iterator skip all whitespace tokens.
+    #[must_use]
+    #[inline]
+    pub fn without_whitespace(mut self) -> Self {
+        self.skip_whitespace = true;
+        self
+    }
+
+    // Returns the position of the next new token, skipping carried-over pieces
+    // and blank lines.
+    fn advance_position(&self, mut position: TokenLocation) -> Option<TokenLocation> {
+        if let Some(current_line) = self.source_file.lines.get(position.line) {
+            // `Line::len()` also counts a possible token that continued from
+            // the previous line.
+            if position.column + 1 < current_line.len() {
+                position.column += 1;
+                return Some(position);
+            }
+        }
+        // Current line is exhausted: walk downward until we find the first line
+        // that **owns local tokens**, because we only want *new* tokens,
+        // not ones continued from previous lines (those were already iterated over).
+        position.line += 1;
+        while let Some(next_line) = self.source_file.lines.get(position.line) {
+            if next_line.local_range().is_some() {
+                // Start at the first *local* token,
+                // skipping any carried-over one
+                position.column = if next_line.continued_from.is_some() {
+                    1
+                } else {
+                    0
+                };
+                return Some(position);
+            }
+            position.line += 1; // keep skipping empty / pure-carried lines
+        }
+        // No more tokens.
+        None
+    }
+
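Editorial aside, not part of the patch: a minimal sketch of the guarantee the skipping logic above provides, namely that a token spanning several physical lines is yielded exactly once. It assumes the `lexer` module is publicly reachable as `rottlib::lexer` and that `/* ... */` lexes as `Token::BlockComment`.

```rust
// Editorial sketch, not part of the patch. Assumes `rottlib::lexer` is the
// public path and that `/* ... */` lexes as `Token::BlockComment`.
use rottlib::lexer::{Token, TokenizedFile};

fn main() {
    // The block comment covers three physical lines, so internally it is
    // recorded on lines 0, 1 and 2, yet the iterator yields it only once.
    let file = TokenizedFile::from_str("/* a\nb\nc */ var");
    let comment_count = file
        .tokens()
        .filter(|(_, piece)| matches!(piece.token, Token::BlockComment))
        .count();
    assert_eq!(comment_count, 1);
}
```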
+    // Creates a new iterator.
+    fn new(source_file: &'src TokenizedFile) -> Tokens<'src> {
+        let mut new_iterator = Tokens {
+            source_file,
+            cursor: Some(TokenLocation { line: 0, column: 0 }),
+            skip_whitespace: false,
+        };
+        // We need to land on the first existing token so [`Iterator::next`]
+        // can assume the cursor is valid.
+        while let Some(token_position) = new_iterator.cursor {
+            if new_iterator.source_file.get(token_position).is_some() {
+                break;
+            }
+            new_iterator.cursor = new_iterator.advance_position(token_position);
+        }
+        new_iterator
+    }
+}
+
+impl<'src> Iterator for Tokens<'src> {
+    type Item = (TokenLocation, TokenPiece<'src>);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        // We only loop to discard whitespace when the flag is on.
+        loop {
+            let current_cursor = self.cursor?;
+            let token_piece = *self.source_file.get(current_cursor)?;
+            self.cursor = self.advance_position(current_cursor);
+
+            // Optional whitespace-skip
+            if !self.skip_whitespace || !token_piece.token.is_whitespace() {
+                return Some((current_cursor, token_piece));
+            }
+        }
+    }
+}
+
+impl<'src> TokenizedFile<'src> {
+    // Returns the final local token in `line_number`
+    // (used to resolve column 0 of a continued line).
+    fn last_piece_in_line(&self, line_number: usize) -> Option<&TokenPiece> {
+        self.lines
+            .get(line_number)
+            .and_then(|line| line.local_range())
+            // `Line::local_range()` is guaranteed to return a non-empty `Range`.
+            .and_then(|range| self.buffer.get(range.end - 1))
+    }
+
+    /// Returns [`TokenPiece`] at a given location if it exists.
+    ///
+    /// If the line specified by [`TokenLocation`] starts with a token that
+    /// continues from the previous line, column `0` refers to that token.
+    ///
+    /// Never panics; an invalid position returns [`None`].
+    ///
+    /// ## Examples
+    ///
+    /// ```rust
+    /// use rottlib::lexer::{TokenizedFile, TokenLocation, Token};
+    /// let file = TokenizedFile::from_str("0 / 0");
+    /// assert_eq!(
+    ///     file.get(TokenLocation { line: 0, column: 2 }).map(|p| p.token),
+    ///     Some(Token::Divide),
+    /// );
+    /// ```
+    #[track_caller]
+    pub fn get(&self, position: TokenLocation) -> Option<&TokenPiece> {
+        let line = self.lines.get(position.line)?;
+        let column = position.column;
+        if column >= line.len() {
+            return None;
+        }
+        if let Some(spanned_line_number) = line.continued_from
+            && column == 0
+        {
+            self.last_piece_in_line(spanned_line_number)
+        } else {
+            // If we have a token that continued from the previous line,
+            // then, relative to `self.buffer`, our `column` is actually 1-based
+            // and we need to shift it back to being 0-based.
+            let token_position =
+                line.local_range.start + column - if line.continued_from.is_some() { 1 } else { 0 };
+            self.buffer.get(token_position)
+        }
+    }
+
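Another editorial sketch (not part of the patch) of the column `0` rule described for `get` above, under the same assumptions about the `rottlib::lexer` path and `Token::BlockComment`:

```rust
// Editorial sketch, not part of the patch. Assumes `rottlib::lexer` is the
// public path and that `/* ... */` lexes as `Token::BlockComment`.
use rottlib::lexer::{Token, TokenLocation, TokenizedFile};

fn main() {
    // The comment starts on line 0 and continues onto line 1.
    let file = TokenizedFile::from_str("/* one\ntwo */ local");

    // The iterator reports the comment once, at the position where it started.
    let (location, piece) = file
        .tokens()
        .find(|(_, piece)| matches!(piece.token, Token::BlockComment))
        .expect("source contains a block comment");
    assert_eq!(location, TokenLocation { line: 0, column: 0 });

    // Column 0 of the next line resolves to the same carried-over token.
    let carried = file
        .get(TokenLocation { line: 1, column: 0 })
        .expect("line 1 starts with the carried-over comment");
    assert_eq!(carried.lexeme, piece.lexeme);
}
```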
+    /// Returns an iterator over all contained tokens in the order they appear
+    /// in the original source file.
+    ///
+    /// By default it includes all tokens, including whitespace and comments.
+    ///
+    /// Returns the same iterator as [`TokenizedFile::into_iter`].
+    #[must_use]
+    #[inline]
+    pub fn tokens(&'src self) -> Tokens<'src> {
+        Tokens::new(self)
+    }
+}
+
+impl<'src> IntoIterator for &'src TokenizedFile<'src> {
+    type Item = (TokenLocation, TokenPiece<'src>);
+    type IntoIter = Tokens<'src>;
+
+    #[inline]
+    fn into_iter(self) -> Self::IntoIter {
+        self.tokens()
+    }
+}
diff --git a/rottlib/src/lexer/lexing.rs b/rottlib/src/lexer/lexing.rs
index d70f54f..fea1ff4 100644
--- a/rottlib/src/lexer/lexing.rs
+++ b/rottlib/src/lexer/lexing.rs
@@ -364,6 +364,30 @@ pub enum Token {
     Error,
 }
 
+impl Token {
+    /// Returns `true` if this token is a newline (`Token::NewLine`).
+    pub fn is_newline(&self) -> bool {
+        matches!(self, Token::NewLine)
+    }
+
+    /// Returns `true` if this token is trivia whitespace
+    /// (`Token::Whitespace` or `Token::NewLine`).
+    ///
+    /// Note: comments are **not** considered whitespace.
+    pub fn is_whitespace(&self) -> bool {
+        matches!(&self, Token::Whitespace | Token::NewLine)
+    }
+
+    /// Returns `true` if this token may span multiple physical lines
+    /// (i.e. can contain newline characters).
+    pub fn can_span_lines(&self) -> bool {
+        matches!(
+            self,
+            Token::BlockComment | Token::Brace(BraceKind::CppBlock) | Token::Error
+        )
+    }
+}
+
 /// Consume a /* ... */ block comment with arbitrary nesting
 /// (like UnrealScript allows).
 ///
diff --git a/rottlib/src/lexer/mod.rs b/rottlib/src/lexer/mod.rs
index f696f7f..8c8d3da 100644
--- a/rottlib/src/lexer/mod.rs
+++ b/rottlib/src/lexer/mod.rs
@@ -12,30 +12,37 @@
 //! precompute lengths of each token in that encoding, making interfacing
 //! easier.
 //!
+//! ## Iteration over tokens
+//!
+//! For simplicity, the code for iterating over the tokens of a
+//! [`TokenizedFile`] lives in the separate submodule [`iterator`].
+//!
 //! ## Opt-in debug helpers
 //!
 //! Extra diagnostics become available in **debug builds** or when the crate is
 //! compiled with `debug` feature enabled. They live in the [`DebugTools`]
 //! extension trait, implemented for [`TokenizedFile`].
 //!
 //! ```
 //! // bring the trait into scope
 //! use lexer::DebugTools;
 //!
-//! let file = TokenizedFile::from_source(src);
+//! let file = TokenizedFile::from_str(src);
 //! file.debug_dump(); // pretty-print token layout
 //! let text = file.to_source(); // reconstruct original text
 //! ```
 
 mod debug_tools;
+mod iterator;
 mod lexing;
 
-use std::{cmp::Ordering, ops::Range};
+use std::ops::Range;
 
 use logos::Logos;
 
 #[cfg(any(debug_assertions, feature = "debug"))]
 pub use debug_tools::DebugTools;
+pub use iterator::Tokens;
 pub use lexing::Token;
 
 /// Empirically chosen starting size for token buffer (used during tokenization)
@@ -45,44 +52,31 @@ const DEFAULT_TOKEN_BUFFER_CAPACITY: usize = 20_000;
 
 /// A slice tagged with its token kind plus two length counters.
 ///
 /// *No absolute coordinates* are stored - they are recomputed per line.
-#[derive(Debug, Clone, Copy)]
+#[derive(Debug, Hash, Clone, Copy, PartialEq, Eq)]
 pub struct TokenPiece<'src> {
-    pub lexeme: &'src str,
+    /// Token represented by this [`TokenPiece`].
     pub token: Token,
+    /// Underlying text that was lexed as the corresponding token.
+    pub lexeme: &'src str,
+    /// Length of the token in UTF-16 code units, precomputed to make seeking
+    /// by LSP cursor coordinates (line + UTF-16 offset) straightforward.
+    pub length_utf16: usize,
 }
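Editorial aside, not part of the patch: `length_utf16` is precomputed as `lexeme.encode_utf16().count()` (see `make_token_piece` later in this diff), which can differ from both the byte length and the character count:

```rust
// Editorial sketch, not part of the patch. Shows how the precomputed UTF-16
// length relates to other length measures for a non-ASCII lexeme.
fn main() {
    // U+1D6D1 (mathematical bold small pi) lies outside the Basic
    // Multilingual Plane, so all three measures differ.
    let lexeme = "𝛑";
    assert_eq!(lexeme.len(), 4);                  // UTF-8 bytes
    assert_eq!(lexeme.chars().count(), 1);        // Unicode scalar values
    assert_eq!(lexeme.encode_utf16().count(), 2); // UTF-16 code units (length_utf16)
}
```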
pub length_utf16: usize, } /// Defines location of a token inside [`TokenizedFile`] in a way, convenient /// for communicating through LSP. -#[derive(Eq, Clone, Copy)] +#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)] pub struct TokenLocation { - line_number: usize, - column: usize, + /// 0-based line number. + pub line: usize, + /// 0-based index of a token in the line, possibly including the token that + /// has continued from the previous line. + pub column: usize, } -impl PartialEq for TokenLocation { - fn eq(&self, other: &TokenLocation) -> bool { - self.line_number == other.line_number && self.column == other.column - } -} - -impl PartialOrd for TokenLocation { - fn partial_cmp(&self, other: &TokenLocation) -> Option { - if self.line_number == other.line_number { - self.column.partial_cmp(&other.column) - } else { - self.line_number.partial_cmp(&other.line_number) - } - } -} - -/// Type for indexing lines in a [`TokenizedFile`]. -type LineNumber = usize; - -/// Type for specific tokens inside each [`Line`]. -type TokenIndex = usize; - /// A tokenized, lossless representation of an UnrealScript source file. +#[derive(Debug)] pub struct TokenizedFile<'src> { /// Arena of every token span in this file. buffer: Vec>, @@ -94,6 +88,9 @@ pub struct TokenizedFile<'src> { } /// Mutable state that encapsulates data needed during the tokenization loop. +/// +/// Access to stored tokens is provided through the [`iterator::Tokens`] +/// iterator. struct Tokenizer<'src> { /// Arena that owns every [`TokenPiece`] produced for the file. buffer: Vec>, @@ -101,7 +98,7 @@ struct Tokenizer<'src> { lines: Vec, /// The current 0-based physical line number. line_number: usize, - /// Index in [`Tokenizer::buffer`] where the current line starts. + /// Index in [`Tokenizer::buffer`] where the current *line* starts. slice_start_index: usize, /// When a multi-line token is being scanned, stores the 0-based line /// on which it started; [`None`] otherwise. @@ -112,25 +109,43 @@ struct Tokenizer<'src> { impl<'src> TokenizedFile<'src> { /// Tokenize `source` and return a fresh [`TokenizedFile`]. - pub fn from_source(source: &'src str) -> TokenizedFile<'src> { - let mut tokenizer = TokenizedFile::<'src>::builder(); + /// + /// ## Examples + /// + /// ```rust + /// let source_text = "2 + 2 * 2".to_string(); + /// let tokenized_file = TokenizedFile::from_str(&source_text); + /// ``` + #[must_use] + pub fn from_str(source: &'src str) -> TokenizedFile<'src> { + let mut tokenizer = Self::builder(); let mut lexer = Token::lexer(source); - // Logos > Ok() > token > token span <- plugged into tokenizer while let Some(token_result) = lexer.next() { + // Add `Token:Error` manually, since Logos won't do it for us. let token = token_result.unwrap_or_else(|_| { tokenizer.had_errors = true; Token::Error }); - let token_span = build_span(token, lexer.slice()); - tokenizer.process_token_span(token_span); + let token_piece = make_token_piece(token, lexer.slice()); + tokenizer.process_token_piece(token_piece); } tokenizer.into_tokenized_file() } /// Returns [`true`] if any erroneous tokens were produced during building /// of this [`TokenizedFile`]. 
     /// Returns [`true`] if any erroneous tokens were produced during building
     /// of this [`TokenizedFile`].
-    pub fn had_errors(&self) -> bool {
+    ///
+    /// ## Examples
+    ///
+    /// ```rust
+    /// let tokenized_file = TokenizedFile::from_str("function test() {}");
+    /// if tokenized_file.has_errors() {
+    ///     println!("Source file contains lexing errors");
+    /// }
+    /// ```
+    #[inline]
+    pub fn has_errors(&self) -> bool {
         self.had_errors
     }
 
@@ -147,22 +162,29 @@ impl<'src> TokenizedFile<'src> {
     }
 }
 
+/// Type for indexing lines in a [`TokenizedFile`].
+type LineIdx = usize;
+
+/// Type for indexing tokens inside each [`Line`].
+type TokenIdx = usize;
+
 /// Representation of a single physical line of the source file.
 ///
 /// [`Range`] are used instead of slices to avoid creating
 /// a self-referential struct (with [`TokenizedFile`]), which rust forbids.
-#[derive(Clone)]
+#[derive(Clone, Debug, Hash, PartialEq, Eq)]
 struct Line {
     /// Token that began on an earlier line (`None` for standalone lines).
-    continued_from: Option<LineNumber>,
+    continued_from: Option<LineIdx>,
     /// Contiguous tokens that started on this line (`start >= end` iff empty).
-    local_range: Range<TokenIndex>,
+    local_range: Range<TokenIdx>,
 }
 
 impl Line {
     /// Creates a standalone line that owns a contiguous slice in
     /// the [`TokenizedFile::buffer`] arena.
-    fn standalone(locals: Range<TokenIndex>) -> Line {
+    #[inline]
+    fn standalone(locals: Range<TokenIdx>) -> Line {
         Line {
             continued_from: None,
             local_range: locals,
@@ -171,7 +193,8 @@ impl Line {
 
     /// Creates a line that is part of a multi-line token started on
     /// another line, referencing the 0-based index of its origin.
-    fn spanned(carried: LineNumber) -> Line {
+    #[inline]
+    fn spanned(carried: LineIdx) -> Line {
         Line {
             continued_from: Some(carried),
             local_range: 0..0,
@@ -180,7 +203,8 @@ impl Line {
 
     /// Creates a line that is part of a multi-line token started on
     /// another line and also contains additional tokens local to itself.
-    fn spanned_with_tokens(carried: LineNumber, locals: Range<TokenIndex>) -> Line {
+    #[inline]
+    fn spanned_with_tokens(carried: LineIdx, locals: Range<TokenIdx>) -> Line {
         Line {
             continued_from: Some(carried),
             local_range: locals,
@@ -192,7 +216,8 @@ impl Line {
     ///
     /// [`None`] means there is no such tokens. Otherwise range is guaranteed
     /// to not be empty.
-    fn local_range(&self) -> Option<Range<TokenIndex>> {
+    #[inline]
+    fn local_range(&self) -> Option<Range<TokenIdx>> {
         if self.local_range.is_empty() {
             None
         } else {
@@ -204,43 +229,45 @@ impl Line {
     ///
     /// Counts both tokens that started on this line and tokens that continued
     /// from previous one.
+    #[inline]
     fn len(&self) -> usize {
-        (self.continued_from.is_some() as usize) + (self.local_range.end - self.local_range.start)
+        (if self.continued_from.is_some() { 1 } else { 0 })
+            + (self.local_range.end - self.local_range.start)
     }
 }
 
 impl<'src> Tokenizer<'src> {
     /// Handles a token span and dispatches to the appropriate handler.
-    fn process_token_span(&mut self, token_span: TokenPiece<'src>) {
-        if token_can_span_lines(&token_span.token) {
-            self.process_multi_line_token(token_span);
+    fn process_token_piece(&mut self, token_piece: TokenPiece<'src>) {
+        if token_piece.token.can_span_lines() {
+            self.process_multi_line_token(token_piece);
         } else {
-            self.process_single_line_token(token_span);
+            self.process_single_line_token(token_piece);
         }
     }
 
     /// Handles tokens that never span multiple lines.
- fn process_single_line_token(&mut self, token_span: TokenPiece<'src>) { - if token_is_newline(&token_span.token) { + fn process_single_line_token(&mut self, token_piece: TokenPiece<'src>) { + if token_piece.token.is_newline() { self.line_number += 1; - self.buffer.push(token_span); + self.buffer.push(token_piece); self.commit_current_line(); } else { - self.buffer.push(token_span); + self.buffer.push(token_piece); } } /// Handles tokens that may contain one or more newline characters. - fn process_multi_line_token(&mut self, token_span: TokenPiece<'src>) { + fn process_multi_line_token(&mut self, token_piece: TokenPiece<'src>) { let start_line = self.line_number; - let newline_count = count_newlines(token_span.lexeme); + let newline_count = count_line_breaks(token_piece.lexeme); // Did this token end in a newline? // This can happen if this is an `Error` token that ends the file. let ends_with_newline = - token_span.lexeme.ends_with('\n') || token_span.lexeme.ends_with('\r'); + token_piece.lexeme.ends_with('\n') || token_piece.lexeme.ends_with('\r'); - self.buffer.push(token_span); + self.buffer.push(token_piece); // We only need to commit the line if this token actually ended the line if newline_count > 0 { self.commit_current_line(); @@ -271,8 +298,8 @@ impl<'src> Tokenizer<'src> { let slice = self.slice_start_index..slice_end; // If we were in the middle of a multi-line token, we - // *always* consume `spanned_from` here, ensuring that each call to - // `commit_current_line()` only applies it once. + // *always* consume `multi_line_start` here, ensuring that each call + // to `commit_current_line()` only applies it once. // This guarantees no "bleed" between adjacent multi-line tokens. if let Some(from) = self.multi_line_start.take() { self.lines.push(Line::spanned_with_tokens(from, slice)); @@ -288,8 +315,9 @@ impl<'src> Tokenizer<'src> { fn into_tokenized_file(mut self) -> TokenizedFile<'src> { // Commit any trailing tokens self.commit_current_line(); - // If we still have a `spanned_from` (i.e. a pure multi-line token with - // no local tokens on its last line), push a bare `Spanned` entry. + // If we still have a `multi_line_start` + // (i.e. a pure multi-line token with no local tokens on its last line), + // push a bare `Line::spanned` entry. if let Some(from) = self.multi_line_start.take() { self.lines.push(Line::spanned(from)); } @@ -306,7 +334,7 @@ impl<'src> Tokenizer<'src> { } } -fn build_span<'src>(token: Token, text: &'src str) -> TokenPiece<'src> { +fn make_token_piece<'src>(token: Token, text: &'src str) -> TokenPiece<'src> { let length_utf16 = text.encode_utf16().count(); TokenPiece { lexeme: text, @@ -315,19 +343,8 @@ fn build_span<'src>(token: Token, text: &'src str) -> TokenPiece<'src> { } } -fn token_is_newline(token: &Token) -> bool { - matches!(token, Token::NewLine) -} - -fn token_can_span_lines(token: &Token) -> bool { - matches!( - token, - Token::BlockComment | Token::Brace(lexing::BraceKind::CppBlock) | Token::Error - ) -} - /// Counts the number of new lines in given text. -fn count_newlines(text: &str) -> usize { +fn count_line_breaks(text: &str) -> usize { let mut bytes_iterator = text.as_bytes().iter().peekable(); let mut newline_count = 0; while let Some(&next_byte) = bytes_iterator.next() {