Refactor Line
The previous definition of the `Line` type was unwieldy and too difficult to work with. This one should make the iterator implementation much easier and has a clearer structure of its own.
parent 4b9d6a6adb
commit 579c2a4d3d
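
To make the shape of the change easier to scan, here is a minimal sketch of the before/after `Line` definitions, condensed from the diff below. The variant and field names are taken from the diff; the surrounding program is illustrative only:

    use std::ops::Range;

    // Old shape: three overlapping variants.
    #[allow(dead_code)]
    enum OldLine {
        Standalone(Range<usize>),
        Spanned(usize),
        SpannedWithTokens(usize, Range<usize>),
    }

    // New shape: one struct; each old variant becomes a combination of fields.
    struct NewLine {
        continued_from: Option<usize>, // Spanned/SpannedWithTokens carry Some(origin)
        local_range: Range<usize>,     // empty (0..0) for pure continuations
    }

    fn main() {
        // Standalone(2..5), Spanned(7) and SpannedWithTokens(7, 5..6), respectively:
        let _standalone = NewLine { continued_from: None, local_range: 2..5 };
        let _spanned = NewLine { continued_from: Some(7), local_range: 0..0 };
        let _with_tokens = NewLine { continued_from: Some(7), local_range: 5..6 };
    }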

@@ -7,8 +7,6 @@
 //!
 //! These checks have been moved to the parent module.
 
-use super::Line;
-
 /// A technical trait that adds debug helpers to the lexer.
 pub trait DebugTools {
     /// Pretty-prints the internal layout of the tokenised file - useful when

@@ -31,62 +29,55 @@ pub trait DebugTools {
 
 impl<'src> DebugTools for super::TokenizedFile<'src> {
     fn reconstruct_source(&self) -> String {
-        let mut result = String::new();
-        for line in &self.lines {
-            if let Line::Standalone(token_range) | Line::SpannedWithTokens(_, token_range) = line {
-                for span in &self.buffer[token_range.clone()] {
-                    result.push_str(span.lexeme);
-                }
-            }
-        }
-        result
+        self.buffer.iter().map(|span| span.lexeme).collect()
     }
 
     fn dump_debug_layout(&self) {
-        for (row_index, line) in self.lines.iter().enumerate() {
-            println!("Line {}", row_index + 1);
-            match line {
-                Line::Standalone(token_range) => {
+        for (row_idx, line) in self.lines.iter().enumerate() {
+            println!("Line {}", row_idx + 1);
+            match (line.continued_from, line.local_range()) {
+                // Stand-alone line (all tokens start here)
+                (None, Some(range)) => {
                     println!("\t[Standalone]");
-                    let mut column_utf16 = 0usize;
-                    for next_token_span in &self.buffer[token_range.clone()] {
-                        let token_beginning = column_utf16;
-                        let token_end = column_utf16 + next_token_span.length_utf16;
-                        println!(
-                            "\t\t{:?} @ {}-{}: {:?}",
-                            next_token_span.token,
-                            token_beginning,
-                            token_end,
-                            next_token_span.lexeme
-                        );
-                        column_utf16 = token_end;
-                    }
+                    dump_spans(&self.buffer[range.clone()]);
                 }
-                Line::Spanned(origin_row) => {
-                    // `origin_row` is 0-based
+                // Pure continuation - the only thing on this line is
+                // the remainder of a multi-line token that started earlier.
+                (Some(origin_row), None) => {
                     println!(
-                        "\t[Continued from line {} - no new tokens here]",
+                        "\t[Continued from line {} – no new tokens here]",
                         origin_row + 1
                     );
                 }
-                Line::SpannedWithTokens(origin_row, token_range) => {
-                    // `origin_row` is 0-based
+                // Continuation **plus** some fresh tokens that begin here.
+                (Some(origin_row), Some(range)) => {
                     println!("\t[Continued from line {} + new tokens]", origin_row + 1);
-                    let mut column_utf16 = 0usize;
-                    for next_token_span in &self.buffer[token_range.clone()] {
-                        let token_beginning = column_utf16;
-                        let token_end = column_utf16 + next_token_span.length_utf16;
-                        println!(
-                            "\t\t{:?} @ {}-{}: {:?}",
-                            next_token_span.token,
-                            token_beginning,
-                            token_end,
-                            next_token_span.lexeme
-                        );
-                        column_utf16 = token_end;
-                    }
+                    dump_spans(&self.buffer[range.clone()]);
                 }
+
+                // An empty physical line (should be rare, but let's be safe).
+                (None, None) => {
+                    println!("\t[Empty line]");
+                }
             }
         }
     }
 }
+
+/// Helper that prints every span in `spans` together with its UTF-16
+/// column boundaries.
+fn dump_spans<'a>(spans: &[super::TokenSpan<'a>]) {
+    let mut col_utf16 = 0usize;
+    for span in spans {
+        let start = col_utf16;
+        let end = start + span.length_utf16;
+        println!(
+            "\t\t{:?} @ {}–{}: {:?}",
+            span.token, start, end, span.lexeme
+        );
+        col_utf16 = end;
+    }
+}
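
A note on why `reconstruct_source` can shrink to a single expression: `String` implements `FromIterator<&str>`, so collecting the mapped lexemes concatenates them in buffer order, and the old per-line loop becomes unnecessary. A standalone sketch of the same pattern, with made-up input data:

    fn main() {
        let lexemes = ["class", " ", "Foo", ";"];
        // String: FromIterator<&str> concatenates items in iteration order.
        let source: String = lexemes.iter().copied().collect();
        assert_eq!(source, "class Foo;");
    }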

@@ -30,7 +30,7 @@
 mod debug_tools;
 mod lexing;
 
-use std::ops::Range;
+use std::{cmp::Ordering, ops::Range};
 
 use logos::Logos;
 

@@ -46,27 +46,36 @@ const DEFAULT_TOKEN_BUFFER_CAPACITY: usize = 20_000;
 ///
 /// *No absolute coordinates* are stored - they are recomputed per line.
 #[derive(Debug, Clone, Copy)]
-struct TokenSpan<'src> {
-    lexeme: &'src str,
-    token: Token,
-    length_utf16: usize,
+pub struct TokenSpan<'src> {
+    pub lexeme: &'src str,
+    pub token: Token,
+    pub length_utf16: usize,
 }
 
+/// Defines the location of a token inside [`TokenizedFile`] in a way
+/// convenient for communicating through LSP.
+#[derive(Eq, Clone, Copy)]
+pub struct TokenLocation {
+    line_number: usize,
+    column: usize,
+}
+
+/// Type for indexing lines in a [`TokenizedFile`].
+type LineNumber = usize;
+
+/// Type for specific tokens inside each [`Line`].
+type TokenIndex = usize;
+
 /// Representation of a single physical line of the source file.
 ///
-/// [`Range<usize>`] are used instead of slices to avoid creating
+/// [`Range<TokenIndex>`] are used instead of slices to avoid creating
 /// a self-referential struct (with [`TokenizedFile`]), which Rust forbids.
 #[derive(Clone)]
-enum Line {
-    /// A standalone line that owns a contiguous slice in
-    /// the [`TokenizedFile::buffer`] arena.
-    Standalone(Range<usize>),
-    /// A 0-based line that is part of a multi-line token started on
-    /// another line.
-    Spanned(usize),
-    /// A 0-based line that is part of a multi-line token started on
-    /// another line *and* contains additional tokens local to itself.
-    SpannedWithTokens(usize, Range<usize>),
+struct Line {
+    /// Line on which a token that continues onto this line began
+    /// (`None` for standalone lines).
+    continued_from: Option<LineNumber>,
+    /// Contiguous tokens that started on this line (`start >= end` iff empty).
+    local_range: Range<TokenIndex>,
 }
 
 /// A tokenized, lossless representation of an UnrealScript source file.
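
The doc comment above keeps the original rationale: if `Line` stored `&[TokenSpan]` slices borrowed from the owning `TokenizedFile`'s buffer, the file would borrow from itself, which safe Rust cannot express. Plain index ranges sidestep that. A minimal sketch of the index-based shape, with illustrative names:

    use std::ops::Range;

    struct File {
        buffer: Vec<String>,      // owned token lexemes
        lines: Vec<Range<usize>>, // indices into `buffer`, no borrows held
    }

    fn main() {
        let file = File {
            buffer: vec!["class".into(), " ".into(), "Foo".into()],
            lines: vec![0..3],
        };
        for range in &file.lines {
            // Resolve indices against the buffer only when needed.
            let line: String = file.buffer[range.clone()].concat();
            println!("{line}");
        }
    }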

@@ -175,7 +184,7 @@ impl<'src> Tokenizer<'src> {
         // exactly `1` interior line)
         let insert_count = newline_count - 1;
         for _ in 0..insert_count {
-            self.lines.push(Line::Spanned(start_line));
+            self.lines.push(Line::spanned(start_line));
         }
         // This is called *after* `commit_current_line()` cleared previous
         // stored value
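
For the hunk above: a token containing `newline_count` newlines touches `newline_count + 1` physical lines, and its first and last lines are committed separately, so exactly `newline_count - 1` interior lines get pushed as pure continuations. A toy check of that arithmetic, with hypothetical numbers:

    fn main() {
        // A block comment starting on line 4 (0-based) that contains 3 newlines
        // touches lines 4..=7; lines 5 and 6 are interior.
        let (start_line, newline_count) = (4usize, 3usize);
        let interior: Vec<usize> = (0..newline_count - 1).map(|_| start_line).collect();
        assert_eq!(interior.len(), 2); // two Line::spanned(4) entries get pushed
    }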

@@ -200,9 +209,9 @@ impl<'src> Tokenizer<'src> {
         // `commit_current_line()` only applies it once.
         // This guarantees no "bleed" between adjacent multi-line tokens.
         if let Some(from) = self.multi_line_start.take() {
-            self.lines.push(Line::SpannedWithTokens(from, slice));
+            self.lines.push(Line::spanned_with_tokens(from, slice));
         } else {
-            self.lines.push(Line::Standalone(slice));
+            self.lines.push(Line::standalone(slice));
         }
         self.slice_start_index = slice_end;
     }
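
The "no bleed" guarantee above rests on `Option::take`, which returns the stored origin and leaves `None` behind, so committing one line cannot accidentally reuse the origin for the next. Minimal illustration:

    fn main() {
        let mut multi_line_start: Option<usize> = Some(7);
        assert_eq!(multi_line_start.take(), Some(7)); // consumed exactly once
        assert_eq!(multi_line_start.take(), None);    // nothing left to bleed
    }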

@@ -216,7 +225,7 @@ impl<'src> Tokenizer<'src> {
         // If we still have a `spanned_from` (i.e. a pure multi-line token with
         // no local tokens on its last line), push a bare `Spanned` entry.
         if let Some(from) = self.multi_line_start.take() {
-            self.lines.push(Line::Spanned(from));
+            self.lines.push(Line::spanned(from));
         }
 
         // Optimize for size

@@ -251,6 +260,72 @@ fn token_can_span_lines(token: &Token) -> bool {
     )
 }
 
+impl Line {
+    /// Creates a standalone line that owns a contiguous slice in
+    /// the [`TokenizedFile::buffer`] arena.
+    fn standalone(locals: Range<TokenIndex>) -> Line {
+        Line {
+            continued_from: None,
+            local_range: locals,
+        }
+    }
+
+    /// Creates a line that is part of a multi-line token started on
+    /// another line, referencing the 0-based index of its origin.
+    fn spanned(carried: LineNumber) -> Line {
+        Line {
+            continued_from: Some(carried),
+            local_range: 0..0,
+        }
+    }
+
+    /// Creates a line that is part of a multi-line token started on
+    /// another line and also contains additional tokens local to itself.
+    fn spanned_with_tokens(carried: LineNumber, locals: Range<TokenIndex>) -> Line {
+        Line {
+            continued_from: Some(carried),
+            local_range: locals,
+        }
+    }
+
+    /// Returns the range of tokens inside [`TokenizedFile::buffer`] that
+    /// start on this line.
+    ///
+    /// [`None`] means there are no such tokens; otherwise the range is
+    /// guaranteed to be non-empty.
+    fn local_range(&self) -> Option<Range<TokenIndex>> {
+        if self.local_range.is_empty() {
+            None
+        } else {
+            Some(self.local_range.clone())
+        }
+    }
+
+    /// Returns the number of tokens on the line.
+    ///
+    /// Counts both the tokens that started on this line and a token
+    /// continued from the previous one.
+    fn len(&self) -> usize {
+        (self.continued_from.is_some() as usize) + (self.local_range.end - self.local_range.start)
+    }
+}
+
+impl PartialEq for TokenLocation {
+    fn eq(&self, other: &TokenLocation) -> bool {
+        self.line_number == other.line_number && self.column == other.column
+    }
+}
+
+impl PartialOrd for TokenLocation {
+    fn partial_cmp(&self, other: &TokenLocation) -> Option<Ordering> {
+        if self.line_number == other.line_number {
+            self.column.partial_cmp(&other.column)
+        } else {
+            self.line_number.partial_cmp(&other.line_number)
+        }
+    }
+}
+
 /// Counts the number of new lines in given text.
 fn count_newlines(text: &str) -> usize {
     let mut bytes_iterator = text.as_bytes().iter().peekable();
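
The `PartialOrd` impl added above orders locations line-major, then by column. A standalone sketch of the same comparison logic (the real `TokenLocation` fields are private, so this uses an illustrative copy):

    use std::cmp::Ordering;

    #[derive(PartialEq, Eq, Clone, Copy)]
    struct Loc {
        line_number: usize,
        column: usize,
    }

    impl PartialOrd for Loc {
        fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
            if self.line_number == other.line_number {
                self.column.partial_cmp(&other.column)
            } else {
                self.line_number.partial_cmp(&other.line_number)
            }
        }
    }

    fn main() {
        let a = Loc { line_number: 2, column: 40 };
        let b = Loc { line_number: 3, column: 0 };
        assert!(a < b); // an earlier line wins regardless of column
        assert!(Loc { line_number: 3, column: 1 } > b);
    }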