Compare commits

4 commits: 4b9d6a6adb ... e2d17f2e8a

| SHA1 |
| --- |
| e2d17f2e8a |
| 9ab65b0b02 |
| 9ff20c7a60 |
| 579c2a4d3d |
@@ -7,8 +7,6 @@
 //!
 //! These checks have been moved to the parent module.
 
-use super::Line;
-
 /// A technical trait that adds debug helpers to the lexer.
 pub trait DebugTools {
     /// Pretty-prints the internal layout of the tokenised file - useful when
@@ -31,62 +29,55 @@ pub trait DebugTools {
 
 impl<'src> DebugTools for super::TokenizedFile<'src> {
     fn reconstruct_source(&self) -> String {
-        let mut result = String::new();
-        for line in &self.lines {
-            if let Line::Standalone(token_range) | Line::SpannedWithTokens(_, token_range) = line {
-                for span in &self.buffer[token_range.clone()] {
-                    result.push_str(span.lexeme);
-                }
-            }
-        }
-        result
+        self.buffer.iter().map(|span| span.lexeme).collect()
     }
 
     fn dump_debug_layout(&self) {
-        for (row_index, line) in self.lines.iter().enumerate() {
-            println!("Line {}", row_index + 1);
-            match line {
-                Line::Standalone(token_range) => {
+        for (row_idx, line) in self.lines.iter().enumerate() {
+            println!("Line {}", row_idx + 1);
+            match (line.continued_from, line.local_range()) {
+                // Stand-alone line (all tokens start here)
+                (None, Some(range)) => {
                     println!("\t[Standalone]");
-                    let mut column_utf16 = 0usize;
-                    for next_token_span in &self.buffer[token_range.clone()] {
-                        let token_beginning = column_utf16;
-                        let token_end = column_utf16 + next_token_span.length_utf16;
-                        println!(
-                            "\t\t{:?} @ {}-{}: {:?}",
-                            next_token_span.token,
-                            token_beginning,
-                            token_end,
-                            next_token_span.lexeme
-                        );
-                        column_utf16 = token_end;
-                    }
+                    dump_spans(&self.buffer[range.clone()]);
                 }
-                Line::Spanned(origin_row) => {
-                    // `origin_row` is 0-based
+                // Pure continuation - the only thing on this line is
+                // the remainder of a multi-line token that started earlier.
+                (Some(origin_row), None) => {
                     println!(
-                        "\t[Continued from line {} - no new tokens here]",
+                        "\t[Continued from line {} – no new tokens here]",
                         origin_row + 1
                     );
                 }
-                Line::SpannedWithTokens(origin_row, token_range) => {
-                    // `origin_row` is 0-based
+                // Continuation **plus** some fresh tokens that begin here.
+                (Some(origin_row), Some(range)) => {
                     println!("\t[Continued from line {} + new tokens]", origin_row + 1);
-                    let mut column_utf16 = 0usize;
-                    for next_token_span in &self.buffer[token_range.clone()] {
-                        let token_beginning = column_utf16;
-                        let token_end = column_utf16 + next_token_span.length_utf16;
-                        println!(
-                            "\t\t{:?} @ {}-{}: {:?}",
-                            next_token_span.token,
-                            token_beginning,
-                            token_end,
-                            next_token_span.lexeme
-                        );
-                        column_utf16 = token_end;
-                    }
+                    dump_spans(&self.buffer[range.clone()]);
+                }
+                // An empty physical line (should be rare, but let's be safe).
+                (None, None) => {
+                    println!("\t[Empty line]");
                 }
             }
         }
     }
 }
+
+/// Helper that prints every span in `spans` together with its UTF-16
+/// column boundaries.
+fn dump_spans<'a>(spans: &[super::TokenPiece<'a>]) {
+    let mut col_utf16 = 0usize;
+    for span in spans {
+        let start = col_utf16;
+        let end = start + span.length_utf16;
+        println!(
+            "\t\t{:?} @ {}–{}: {:?}",
+            span.token, start, end, span.lexeme
+        );
+        col_utf16 = end;
+    }
+}
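For orientation only, a sketch of how the reworked helpers might be exercised from a debug build. It is not part of the commit; it assumes the crate layout shown in this diff and uses `lexer::` as the module path, the same path the crate's own doc examples use.

```rust
// Sketch, not part of the commit: exercises DebugTools as implemented above.
#[cfg(any(debug_assertions, feature = "debug"))]
fn inspect(src: &str) {
    use lexer::DebugTools; // bring the extension trait into scope

    let file = lexer::TokenizedFile::from_str(src);
    file.dump_debug_layout(); // per-line layout, columns in UTF-16 units
    // Round trip: concatenating every lexeme must reproduce the input.
    assert_eq!(file.reconstruct_source(), src);
}
```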
rottlib/src/lexer/iterator.rs (new file, +191 lines)
@@ -0,0 +1,191 @@
+//! Sub-module that adds an iterator to [`TokenizedFile`] which yields tokens in
+//! the order they appear in the source code.
+//!
+//! ## Examples
+//!
+//! ```rust
+//! let iter = TokenizedFile::from_str("0 / 0").tokens().without_whitespace();
+//! ```
+//!
+//! ## Terminology: continued tokens
+//!
+//! Some [`super::Token`]s (e.g. [`super::Token::CppText`] or
+//! [`super::Token::BlockComment`] can span multiple lines and are recorded on
+//! every line on which they appear (usually as the first, and sometimes
+//! the only, token).
+//! In this module these are referred to as "continued" or
+//! "carried-over" tokens.
+//! Since our iterator needs to return each token only once, we take special
+//! care to skip such continued tokens during iteration.
+
+use super::{TokenLocation, TokenPiece, TokenizedFile};
+
+/// An immutable iterator over all tokens in a [`TokenizedFile`], preserving
+/// their order of appearance in the original source file.
+///
+/// After exhaustion it keeps returning [`None`].
+#[must_use]
+#[derive(Clone, Debug)]
+pub struct Tokens<'src> {
+    /// [`TokenLocation`] of the next token to be returned.
+    ///
+    /// [`None`] means the iterator has been exhausted.
+    cursor: Option<TokenLocation>,
+    /// [`TokenizedFile`] whose tokens we’re iterating over.
+    source_file: &'src TokenizedFile<'src>,
+    /// When `true`, whitespace tokens are skipped.
+    skip_whitespace: bool,
+}
+
+// Because we can only return [`None`] after we've returned it once.
+impl<'src> std::iter::FusedIterator for Tokens<'src> {}
+
+impl<'src> Tokens<'src> {
+    /// Makes the iterator skip all whitespace tokens.
+    #[must_use]
+    #[inline]
+    pub fn without_whitespace(mut self) -> Self {
+        self.skip_whitespace = true;
+        self
+    }
+
+    // Returns the position of the next new token, skipping carried-over pieces
+    // and blank lines.
+    fn advance_position(&self, mut position: TokenLocation) -> Option<TokenLocation> {
+        if let Some(current_line) = self.source_file.lines.get(position.line) {
+            // `Line::len()` also counts a possible token that continued from
+            // the previous line.
+            if position.column + 1 < current_line.len() {
+                position.column += 1;
+                return Some(position);
+            }
+        }
+        // Current line is exhausted: walk downward until we find the first line
+        // that **owns local tokens**, because we only want *new* token,
+        // not continued from previous lines (they were already iterated over).
+        position.line += 1;
+        while let Some(next_line) = self.source_file.lines.get(position.line) {
+            if next_line.local_range().is_some() {
+                // Start at the first *local* token,
+                // skipping any carried-over one
+                position.column = if next_line.continued_from.is_some() {
+                    1
+                } else {
+                    0
+                };
+                return Some(position);
+            }
+            position.line += 1; // keep skipping empty / pure-carried lines
+        }
+        // No more tokens.
+        None
+    }
+
+    // Creates a new iterator.
+    fn new(source_file: &'src TokenizedFile) -> Tokens<'src> {
+        let mut new_iterator = Tokens {
+            source_file,
+            cursor: Some(TokenLocation { line: 0, column: 0 }),
+            skip_whitespace: false,
+        };
+        // We need to land on the first existing token so [`Iterator::next`]
+        // can assume cursor is valid.
+        while let Some(token_position) = new_iterator.cursor {
+            if new_iterator.source_file.get(token_position).is_some() {
+                break;
+            }
+            new_iterator.cursor = new_iterator.advance_position(token_position);
+        }
+        new_iterator
+    }
+}
+
+impl<'src> Iterator for Tokens<'src> {
+    type Item = (TokenLocation, TokenPiece<'src>);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        // We only ever loop to discard whitespaces when the flag is on
+        loop {
+            let current_cursor = self.cursor?;
+            let token_piece = *self.source_file.get(current_cursor)?;
+            self.cursor = self.advance_position(current_cursor);
+
+            // Optional whitespace-skip
+            if !self.skip_whitespace || !token_piece.token.is_whitespace() {
+                return Some((current_cursor, token_piece));
+            }
+        }
+    }
+}
+
+impl<'src> TokenizedFile<'src> {
+    // Returns the final local token in `line_number`
+    // (used to resolve column 0 of a continued line).
+    fn last_piece_in_line(&self, line_number: usize) -> Option<&TokenPiece> {
+        self.lines
+            .get(line_number)
+            .and_then(|line| line.local_range())
+            // `Line::local_range()` is guaranteed to return non-empty `Range`.
+            .and_then(|range| self.buffer.get(range.end - 1))
+    }
+
+    /// Returns [`TokenPiece`] at a given location if it exists.
+    ///
+    /// If the line specified by [`TokenLocation`] starts with a token that
+    /// continues from the previous line - column `0` refers to that token.
+    ///
+    /// Never panics, invalid position returns [`None`].
+    ///
+    /// ## Examples
+    ///
+    /// ```rust
+    /// use mycrate::{TokenizedFile, TokenLocation, Token};
+    /// let file = TokenizedFile::from_str("0 / 0");
+    /// assert_eq!(
+    ///     file.get(TokenLocation { line: 0, column: 2 }).map(|p| p.token),
+    ///     Some(Token::Divide),
+    /// );
+    /// ```
+    #[track_caller]
+    pub fn get(&self, position: TokenLocation) -> Option<&TokenPiece> {
+        let line = self.lines.get(position.line)?;
+        let column = position.column;
+        if column >= line.len() {
+            return None;
+        }
+        if let Some(spanned_line_number) = line.continued_from
+            && column == 0
+        {
+            self.last_piece_in_line(spanned_line_number)
+        } else {
+            // If we have a token that continued from the previous line,
+            // then, relative to `self.buffer`, our `column` is actually 1-based
+            // and we need to shift it back to being 0-based.
+            let token_position =
+                line.local_range.start + column - if line.continued_from.is_some() { 1 } else { 0 };
+            self.buffer.get(token_position)
+        }
+    }
+
+    /// Returns an iterator over all contained tokens in the order they appear
+    /// in the original source file.
+    ///
+    /// By default includes all tokens, including whitespace and comments.
+    ///
+    /// Returns the same iterator as [`TokenizedFile::into_iter`]
+    #[must_use]
+    #[inline]
+    pub fn tokens(&'src self) -> Tokens<'src> {
+        Tokens::new(self)
+    }
+}
+
+impl<'src> IntoIterator for &'src TokenizedFile<'src> {
+    type Item = (TokenLocation, TokenPiece<'src>);
+    type IntoIter = Tokens<'src>;
+
+    #[inline]
+    fn into_iter(self) -> Self::IntoIter {
+        self.tokens()
+    }
+}
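Not part of the diff: a minimal sketch of driving the new iterator, using only items introduced in this file (`tokens()`, `without_whitespace()`, `TokenLocation`, `TokenPiece`).

```rust
// Sketch: print every non-whitespace token with its line/column location.
fn dump_tokens(src: &str) {
    let file = TokenizedFile::from_str(src);
    // Pieces of a multi-line token are reported once, at their starting location.
    for (location, piece) in file.tokens().without_whitespace() {
        println!(
            "{}:{} {:?} {:?}",
            location.line, location.column, piece.token, piece.lexeme
        );
    }
}
```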
@@ -2,7 +2,7 @@
 //!
 //! ## Notable details
 //!
-//! Lexer for UnrealScript that recognises inline `cpptext { … }` blocks.
+//! Lexer for UnrealScript that recognizes inline `cpptext { … }` blocks.
 //!
 //! In UnrealScript, `cpptext` lets authors embed raw C++ between braces.
 //! Because whitespace, newlines, or comments may appear between the
@@ -41,14 +41,14 @@ pub struct LexerState {
 }
 
 /// Are these braces "real" UnrealScript braces, or the start/end of a C++ block?
-#[derive(Debug, PartialEq, Clone, Copy)]
+#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
 pub enum BraceKind {
     Normal,
     CppBlock,
 }
 
 /// All UnrealScript tokens that our compiler distinguishes.
-#[derive(logos::Logos, Debug, PartialEq, Clone, Copy)]
+#[derive(logos::Logos, Debug, PartialEq, Eq, Hash, Clone, Copy)]
 #[logos(extras = LexerState)]
 pub enum Token {
     // # Compiler/directive keywords
@@ -247,9 +247,9 @@ pub enum Token {
     #[token("~")]
     BitwiseNot,
     // ## Vector
-    #[token("dot")]
+    #[regex("(?i)dot")]
     Dot,
-    #[token("cross")]
+    #[regex("(?i)cross")]
     Cross,
     // ## Multiplicative
     #[token("*")]
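Switching these attributes from `#[token(...)]` to `#[regex("(?i)...")]` makes the `dot` and `cross` operator keywords case-insensitive. A test-style sketch of the intended behaviour, assuming in-crate access to `Token` and that keyword/identifier disambiguation is otherwise unchanged:

```rust
// Sketch: with the `(?i)` patterns, every casing lexes as the same token.
#[test]
fn dot_is_case_insensitive() {
    use logos::Logos;

    let dot_count = Token::lexer("DOT dot Dot")
        .filter(|result| matches!(result, Ok(Token::Dot)))
        .count();
    assert_eq!(dot_count, 3);
}
```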
@@ -290,6 +290,8 @@ pub enum Token {
     NotEqual,
     #[token("~=")]
     ApproximatelyEqual,
+    #[regex("(?i)clockwisefrom")]
+    ClockwiseFrom,
     // ## Bitwise
     #[token("&")]
     BitwiseAnd,
@@ -297,11 +299,11 @@ pub enum Token {
     BitwiseOr,
     #[token("^")]
     BitwiseXor,
-    #[token("^^")]
-    BooleanXor,
     // ## Logical
     #[token("&&")]
     And,
+    #[token("^^")]
+    Xor,
     #[token("||")]
     Or,
     // ## Assigments
@@ -311,6 +313,8 @@ pub enum Token {
     MultiplyAssign,
     #[token("/=")]
     DivideAssign,
+    #[token("%=")]
+    ModuloAssign,
     #[token("+=")]
     PlusAssign,
     #[token("-=")]
@@ -341,6 +345,10 @@ pub enum Token {
     Period,
     #[token(":")]
     Colon,
+    #[token("#")]
+    Hash,
+    #[token("?")]
+    Question,
 
     // # Comments & whitespaces
     #[regex(r"//[^\r\n]*")]
@@ -356,6 +364,30 @@ pub enum Token {
     Error,
 }
 
+impl Token {
+    /// Returns `true` if this token is a newline (`Token::NewLine`).
+    pub fn is_newline(&self) -> bool {
+        matches!(self, Token::NewLine)
+    }
+
+    /// Returns `true` if this token is trivia whitespace
+    /// (`Token::Whitespace` or `Token::NewLine`).
+    ///
+    /// Note: comments are **not** considered whitespace.
+    pub fn is_whitespace(&self) -> bool {
+        matches!(&self, Token::Whitespace | Token::NewLine)
+    }
+
+    /// Returns `true` if this token may span multiple physical lines
+    /// (i.e. can contain newline characters).
+    pub fn can_span_lines(&self) -> bool {
+        matches!(
+            self,
+            Token::BlockComment | Token::Brace(BraceKind::CppBlock) | Token::Error
+        )
+    }
+}
+
 /// Consume a /* ... */ block comment with arbitrary nesting
 /// (like UnrealScript allows).
 ///
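A small sketch, not from the commit, of how the classification helpers added above compose; only the methods shown in this hunk are used.

```rust
// Sketch: rough classification of a token using the new helper methods.
fn describe(token: Token) -> &'static str {
    if token.is_newline() {
        "newline"
    } else if token.is_whitespace() {
        "whitespace (newline or blanks; comments are not whitespace)"
    } else if token.can_span_lines() {
        "may span lines (block comment, cpptext block brace, or error)"
    } else {
        "ordinary single-line token"
    }
}
```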
@@ -12,22 +12,28 @@
 //! precompute lengths of each token in that encoding, making interfacing
 //! easier.
 //!
+//! ## Iteration over tokens
+//!
+//! For simplicity we've moved out code for iterating over tokens of
+//! [`TokenizedFile`] into a separate submodule [`iterator`].
+//!
 //! ## Opt-in debug helpers
 //!
 //! Extra diagnostics become available in **debug builds** or when the crate is
-//! compiled with `debug` feature enabled. They live in the [`DebugTools`]
+//! compiled with `debug` feature enabled. They live in the [`debug_tools`]
 //! extension trait, implemented for [`TokenizedFile`].
 //!
 //! ```
 //! // bring the trait into scope
 //! use lexer::DebugTools;
 //!
-//! let file = TokenizedFile::from_source(src);
+//! let file = TokenizedFile::from_str(src);
 //! file.debug_dump(); // pretty-print token layout
 //! let text = file.to_source(); // reconstruct original text
 //! ```
 
 mod debug_tools;
+mod iterator;
 mod lexing;
 
 use std::ops::Range;
@@ -36,6 +42,7 @@ use logos::Logos;
 
 #[cfg(any(debug_assertions, feature = "debug"))]
 pub use debug_tools::DebugTools;
+pub use iterator::Tokens;
 pub use lexing::Token;
 
 /// Empirically chosen starting size for token buffer (used during tokenization)
@@ -45,34 +52,34 @@ const DEFAULT_TOKEN_BUFFER_CAPACITY: usize = 20_000;
 /// A slice tagged with its token kind plus two length counters.
 ///
 /// *No absolute coordinates* are stored - they are recomputed per line.
-#[derive(Debug, Clone, Copy)]
-struct TokenSpan<'src> {
-    lexeme: &'src str,
-    token: Token,
-    length_utf16: usize,
+#[derive(Debug, Hash, Clone, Copy, PartialEq, Eq)]
+pub struct TokenPiece<'src> {
+    /// Token, represented by this [`TokenPiece`].
+    pub token: Token,
+    /// Underlying text that was lexed as the corresponding token.
+    pub lexeme: &'src str,
+    /// Length of the token in UTF-16 code units for the needs of easy seeking
+    /// using given LSP cursor coordinates (line + UTF-16 offset).
+    /// Precomputed for convenience.
+    pub length_utf16: usize,
 }
 
-/// Representation of a single physical line of the source file.
-///
-/// [`Range<usize>`] are used instead of slices to avoid creating
-/// a self-referential struct (with [`TokenizedFile`]), which rust forbids.
-#[derive(Clone)]
-enum Line {
-    /// A standalone line that owns a contiguous slice in
-    /// the [`TokenizedFile::buffer`] arena.
-    Standalone(Range<usize>),
-    /// A 0-based line that is part of a multi-line token started on
-    /// another line.
-    Spanned(usize),
-    /// A 0-based line that is part of a multi-line token started on
-    /// another line *and* contains additional tokens local to itself.
-    SpannedWithTokens(usize, Range<usize>),
+/// Defines location of a token inside [`TokenizedFile`] in a way, convenient
+/// for communicating through LSP.
+#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
+pub struct TokenLocation {
+    /// 0-based line number.
+    pub line: usize,
+    /// 0-based index of a token in the line, possibly including the token that
+    /// has continued from the previous line.
+    pub column: usize,
 }
 
 /// A tokenized, lossless representation of an UnrealScript source file.
+#[derive(Debug)]
 pub struct TokenizedFile<'src> {
     /// Arena of every token span in this file.
-    buffer: Vec<TokenSpan<'src>>,
+    buffer: Vec<TokenPiece<'src>>,
     /// Mapping that provides an easy and efficient access to tokens by
     /// line number.
     lines: Vec<Line>,
@@ -81,14 +88,17 @@ pub struct TokenizedFile<'src> {
 }
 
 /// Mutable state that encapsulates data needed during the tokenization loop.
+///
+/// Access to stored tokens is provided through the [`iterator::Tokens`]
+/// iterator.
 struct Tokenizer<'src> {
-    /// Arena that owns every [`TokenSpan`] produced for the file.
-    buffer: Vec<TokenSpan<'src>>,
+    /// Arena that owns every [`TokenPiece`] produced for the file.
+    buffer: Vec<TokenPiece<'src>>,
     /// Mapping from physical line number to the tokens that belong to it.
     lines: Vec<Line>,
     /// The current 0-based physical line number.
     line_number: usize,
-    /// Index in [`Tokenizer::buffer`] where the current line starts.
+    /// Index in [`Tokenizer::buffer`] where the current *line* starts.
     slice_start_index: usize,
     /// When a multi-line token is being scanned, stores the 0-based line
     /// on which it started; [`None`] otherwise.
@@ -99,25 +109,43 @@ struct Tokenizer<'src> {
 
 impl<'src> TokenizedFile<'src> {
     /// Tokenize `source` and return a fresh [`TokenizedFile`].
-    pub fn from_source(source: &'src str) -> TokenizedFile<'src> {
-        let mut tokenizer = TokenizedFile::<'src>::builder();
+    ///
+    /// ## Examples
+    ///
+    /// ```rust
+    /// let source_text = "2 + 2 * 2".to_string();
+    /// let tokenized_file = TokenizedFile::from_str(&source_text);
+    /// ```
+    #[must_use]
+    pub fn from_str(source: &'src str) -> TokenizedFile<'src> {
+        let mut tokenizer = Self::builder();
         let mut lexer = Token::lexer(source);
 
-        // Logos > Ok() > token > token span <- plugged into tokenizer
         while let Some(token_result) = lexer.next() {
+            // Add `Token:Error` manually, since Logos won't do it for us.
             let token = token_result.unwrap_or_else(|_| {
                 tokenizer.had_errors = true;
                 Token::Error
             });
-            let token_span = build_span(token, lexer.slice());
-            tokenizer.process_token_span(token_span);
+            let token_piece = make_token_piece(token, lexer.slice());
+            tokenizer.process_token_piece(token_piece);
         }
         tokenizer.into_tokenized_file()
     }
 
     /// Returns [`true`] if any erroneous tokens were produced during building
     /// of this [`TokenizedFile`].
-    pub fn had_errors(&self) -> bool {
+    ///
+    /// ## Examples
+    ///
+    /// ```rust
+    /// let tokenized_file = TokenizedFile::from_str("function test() {}");
+    /// if tokenized_file.has_errors() {
+    ///     println!("Error while parsing file: {}", path.display());
+    /// }
+    /// ```
+    #[inline]
+    pub fn has_errors(&self) -> bool {
         self.had_errors
     }
 
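Taken together, a hedged usage sketch of the renamed constructor and accessor, mirroring the doc examples added in this hunk; it is not part of the commit.

```rust
// Sketch: build a TokenizedFile and report whether any Token::Error was produced.
fn lex_and_check(source_text: &str) -> bool {
    let tokenized_file = TokenizedFile::from_str(source_text);
    let had_problems = tokenized_file.has_errors();
    if had_problems {
        eprintln!("lexing produced at least one `Token::Error`");
    }
    had_problems
}
```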
@@ -134,38 +162,112 @@ impl<'src> TokenizedFile<'src> {
     }
 }
 
+/// Type for indexing lines in a [`TokenizedFile`].
+type LineIdx = usize;
+
+/// Type for specific tokens inside each [`Line`].
+type TokenIdx = usize;
+
+/// Representation of a single physical line of the source file.
+///
+/// [`Range<TokenIndex>`] are used instead of slices to avoid creating
+/// a self-referential struct (with [`TokenizedFile`]), which rust forbids.
+#[derive(Clone, Debug, Hash, PartialEq, Eq)]
+struct Line {
+    /// Token that began on an earlier line (`None` for standalone lines).
+    continued_from: Option<LineIdx>,
+    /// Contiguous tokens that started on this line (`start >= end` iff empty).
+    local_range: Range<TokenIdx>,
+}
+
+impl Line {
+    /// Creates a standalone line that owns a contiguous slice in
+    /// the [`TokenizedFile::buffer`] arena.
+    #[inline]
+    fn standalone(locals: Range<TokenIdx>) -> Line {
+        Line {
+            continued_from: None,
+            local_range: locals,
+        }
+    }
+
+    /// Creates a line that is part of a multi-line token started on
+    /// another line, referencing the 0-based index of its origin.
+    #[inline]
+    fn spanned(carried: LineIdx) -> Line {
+        Line {
+            continued_from: Some(carried),
+            local_range: 0..0,
+        }
+    }
+
+    /// Creates a line that is part of a multi-line token started on
+    /// another line and also contains additional tokens local to itself.
+    #[inline]
+    fn spanned_with_tokens(carried: LineIdx, locals: Range<TokenIdx>) -> Line {
+        Line {
+            continued_from: Some(carried),
+            local_range: locals,
+        }
+    }
+
+    /// Returns a range of tokens inside [`TokenizedFile::buffer`] that start
+    /// on this line.
+    ///
+    /// [`None`] means there is no such tokens. Otherwise range is guaranteed
+    /// to not be empty.
+    #[inline]
+    fn local_range(&self) -> Option<Range<TokenIdx>> {
+        if self.local_range.is_empty() {
+            None
+        } else {
+            Some(self.local_range.clone())
+        }
+    }
+
+    /// Returns amount of tokens of the line.
+    ///
+    /// Counts both tokens that started on this line and tokens that continued
+    /// from previous one.
+    #[inline]
+    fn len(&self) -> usize {
+        (if self.continued_from.is_some() { 1 } else { 0 })
+            + (self.local_range.end - self.local_range.start)
+    }
+}
+
 impl<'src> Tokenizer<'src> {
     /// Handles a token span and dispatches to the appropriate handler.
-    fn process_token_span(&mut self, token_span: TokenSpan<'src>) {
-        if token_can_span_lines(&token_span.token) {
-            self.process_multi_line_token(token_span);
+    fn process_token_piece(&mut self, token_piece: TokenPiece<'src>) {
+        if token_piece.token.can_span_lines() {
+            self.process_multi_line_token(token_piece);
         } else {
-            self.process_single_line_token(token_span);
+            self.process_single_line_token(token_piece);
         }
     }
 
     /// Handles tokens that never span multiple lines.
-    fn process_single_line_token(&mut self, token_span: TokenSpan<'src>) {
-        if token_is_newline(&token_span.token) {
+    fn process_single_line_token(&mut self, token_piece: TokenPiece<'src>) {
+        if token_piece.token.is_newline() {
             self.line_number += 1;
-            self.buffer.push(token_span);
+            self.buffer.push(token_piece);
             self.commit_current_line();
         } else {
-            self.buffer.push(token_span);
+            self.buffer.push(token_piece);
         }
     }
 
     /// Handles tokens that may contain one or more newline characters.
-    fn process_multi_line_token(&mut self, token_span: TokenSpan<'src>) {
+    fn process_multi_line_token(&mut self, token_piece: TokenPiece<'src>) {
         let start_line = self.line_number;
-        let newline_count = count_newlines(token_span.lexeme);
+        let newline_count = count_line_breaks(token_piece.lexeme);
 
         // Did this token end in a newline?
         // This can happen if this is an `Error` token that ends the file.
         let ends_with_newline =
-            token_span.lexeme.ends_with('\n') || token_span.lexeme.ends_with('\r');
+            token_piece.lexeme.ends_with('\n') || token_piece.lexeme.ends_with('\r');
 
-        self.buffer.push(token_span);
+        self.buffer.push(token_piece);
         // We only need to commit the line if this token actually ended the line
         if newline_count > 0 {
             self.commit_current_line();
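To make the new indexing convention concrete, a sketch of how `len()` and `local_range()` relate for a line that both continues a multi-line token and owns three local tokens. It is not part of the commit, the numbers are illustrative, and it would have to live inside the `lexer` module, where `Line` is visible.

```rust
// Sketch: column 0 is the carried-over piece, columns 1..=3 map to buffer[10..13].
#[test]
fn line_len_counts_the_carried_over_piece() {
    let line = Line {
        continued_from: Some(3), // continues a token that began on line index 3
        local_range: 10..13,     // three locally-starting pieces in the token arena
    };
    assert_eq!(line.len(), 4); // 1 carried-over + 3 local tokens
    assert_eq!(line.local_range(), Some(10..13));
}
```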
@@ -175,7 +277,7 @@ impl<'src> Tokenizer<'src> {
             // exactly `1` interior line)
             let insert_count = newline_count - 1;
             for _ in 0..insert_count {
-                self.lines.push(Line::Spanned(start_line));
+                self.lines.push(Line::spanned(start_line));
             }
             // This is called *after* `commit_current_line()` cleared previous
             // stored value
@@ -196,13 +298,13 @@ impl<'src> Tokenizer<'src> {
         let slice = self.slice_start_index..slice_end;
 
         // If we were in the middle of a multi-line token, we
-        // *always* consume `spanned_from` here, ensuring that each call to
-        // `commit_current_line()` only applies it once.
+        // *always* consume `multi_line_start` here, ensuring that each call
+        // to `commit_current_line()` only applies it once.
         // This guarantees no "bleed" between adjacent multi-line tokens.
         if let Some(from) = self.multi_line_start.take() {
-            self.lines.push(Line::SpannedWithTokens(from, slice));
+            self.lines.push(Line::spanned_with_tokens(from, slice));
         } else {
-            self.lines.push(Line::Standalone(slice));
+            self.lines.push(Line::standalone(slice));
         }
         self.slice_start_index = slice_end;
     }
@@ -213,10 +315,11 @@ impl<'src> Tokenizer<'src> {
     fn into_tokenized_file(mut self) -> TokenizedFile<'src> {
         // Commit any trailing tokens
        self.commit_current_line();
-        // If we still have a `spanned_from` (i.e. a pure multi-line token with
-        // no local tokens on its last line), push a bare `Spanned` entry.
+        // If we still have a `multi_line_start`
+        // (i.e. a pure multi-line token with no local tokens on its last line),
+        // push a bare `Line::spanned` entry.
         if let Some(from) = self.multi_line_start.take() {
-            self.lines.push(Line::Spanned(from));
+            self.lines.push(Line::spanned(from));
         }
 
         // Optimize for size
@@ -231,28 +334,17 @@ impl<'src> Tokenizer<'src> {
     }
 }
 
-fn build_span<'src>(token: Token, text: &'src str) -> TokenSpan<'src> {
+fn make_token_piece<'src>(token: Token, text: &'src str) -> TokenPiece<'src> {
     let length_utf16 = text.encode_utf16().count();
-    TokenSpan {
+    TokenPiece {
         lexeme: text,
         token,
         length_utf16,
     }
 }
 
-fn token_is_newline(token: &Token) -> bool {
-    matches!(token, Token::NewLine)
-}
-
-fn token_can_span_lines(token: &Token) -> bool {
-    matches!(
-        token,
-        Token::BlockComment | Token::Brace(lexing::BraceKind::CppBlock) | Token::Error
-    )
-}
-
 /// Counts the number of new lines in given text.
-fn count_newlines(text: &str) -> usize {
+fn count_line_breaks(text: &str) -> usize {
     let mut bytes_iterator = text.as_bytes().iter().peekable();
     let mut newline_count = 0;
     while let Some(&next_byte) = bytes_iterator.next() {
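The compare view stops inside `count_line_breaks`, so the loop body is not shown here. Purely as an assumption, a CRLF-aware body consistent with the visible prologue could look like the following; the actual commit may differ.

```rust
/// Counts the number of line breaks in `text`, treating "\r\n" as a single break.
fn count_line_breaks(text: &str) -> usize {
    let mut bytes_iterator = text.as_bytes().iter().peekable();
    let mut newline_count = 0;
    while let Some(&next_byte) = bytes_iterator.next() {
        match next_byte {
            b'\r' => {
                // Swallow a following '\n' so that "\r\n" is counted once.
                if let Some(&&b'\n') = bytes_iterator.peek() {
                    bytes_iterator.next();
                }
                newline_count += 1;
            }
            b'\n' => newline_count += 1,
            _ => {}
        }
    }
    newline_count
}
```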