Initial commit

commit 4b9d6a6adb

.gitignore (vendored, Normal file, 3 lines)

@@ -0,0 +1,3 @@
/target
flamegraph.svg
perf.data

Cargo.lock (generated, Normal file, 1104 lines)

File diff suppressed because it is too large.

Cargo.toml (Normal file, 26 lines)

@@ -0,0 +1,26 @@
[workspace]
resolver = "2"
members = ["dev_tests", "rottlsp", "rottlib"]

[workspace.package]
edition = "2024"

[workspace.lints.clippy]
all = "warn"
nursery = "warn"
pedantic = "warn"

[profile.release]
opt-level = 3 # Optimize for speed
strip = true # Strip symbols from binary
lto = true # Enable link-time optimization
panic = "abort" # Abort on panic
overflow-checks = false # no integer checks
codegen-units = 1 # Reduce number of codegen units to increase optimizations
debug = false # strip all debug info

[profile.flamegraph]
inherits = "release" # start from release
strip = false
debug = true # full DWARF info for unwinding
split-debuginfo = "unpacked" # keep symbols inside the binary

dev_tests/Cargo.toml (Normal file, 23 lines)

@@ -0,0 +1,23 @@
[package]
name = "dev_tests"
version = "0.1.0"
edition = "2024"

[[bin]]
name = "dump_tokens"
path = "src/dump_tokens.rs"

[[bin]]
name = "uc_lexer_verify"
path = "src/uc_lexer_verify.rs"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
rottlib = { version = "0", path = "../rottlib", features = ["debug"] }
walkdir = "2.5"
encoding_rs = "0.8"
chardet = "0.2"

[lints]
workspace = true

dev_tests/src/dump_tokens.rs (Normal file, 76 lines)

@@ -0,0 +1,76 @@
use std::{
    fs,
    path::{Path, PathBuf},
};

use encoding_rs::{Encoding, UTF_8};
use rottlib::lexer::{DebugTools, TokenizedFile};

/// Recursively search `root` for the first file whose *basename* matches
/// `needle` (case-sensitive).
///
/// Returns the absolute path.
fn find_file(root: &Path, needle: &str) -> Option<PathBuf> {
    for entry in walkdir::WalkDir::new(root)
        .into_iter()
        .filter_map(Result::ok)
    {
        let path = entry.path();
        if path.is_file() && (path.file_name().and_then(|name| name.to_str()) == Some(needle)) {
            return fs::canonicalize(path).ok();
        }
    }
    None
}

/// CLI: `dump_tokens <root_dir> <file_name>` - searches for `<file_name>`
/// recursively inside `<root_dir>`.
///
/// This utility takes a *root directory* and a *file name* instead of a full
/// path, saving us from hunting down and typing out full paths:
///
/// - We know where all the sources are;
/// - We usually just know the name of the file that is being problematic.
fn main() {
    let mut args = std::env::args().skip(1);
    let root_dir = args.next().unwrap_or_else(|| {
        eprintln!("Usage: dump_tokens <root_dir> <file_name>");
        std::process::exit(1);
    });
    let file_name = args.next().unwrap_or_else(|| {
        eprintln!("Usage: dump_tokens <root_dir> <file_name>");
        std::process::exit(1);
    });

    let root = PathBuf::from(&root_dir);
    if !root.exists() {
        eprintln!("Root directory '{root_dir}' does not exist.");
        std::process::exit(1);
    }

    let found_path = find_file(&root, &file_name).map_or_else(
        || {
            eprintln!("File '{file_name}' not found under '{root_dir}'.");
            std::process::exit(1);
        },
        |path| path,
    );

    // Read & decode
    let raw_bytes = match fs::read(&found_path) {
        Ok(sources) => sources,
        Err(error) => {
            eprintln!("Could not read {}: {error}", found_path.display());
            std::process::exit(1);
        }
    };

    let (encoding_label, _, _) = chardet::detect(&raw_bytes);
    let encoding = Encoding::for_label(encoding_label.as_bytes()).unwrap_or(UTF_8);
    let (decoded_str, _, _) = encoding.decode(&raw_bytes);

    let source_text = decoded_str.to_string();
    let tokenized_file = TokenizedFile::from_source(&source_text);

    tokenized_file.dump_debug_layout();
}

dev_tests/src/uc_lexer_verify.rs (Normal file, 122 lines)

@@ -0,0 +1,122 @@
use std::{collections::HashSet, fs, path::PathBuf};

use rottlib::lexer::{DebugTools, TokenizedFile};

/// Read `ignore.txt` (one path per line, `#` for comments) from the root
/// directory and turn it into a canonicalized [`HashSet<PathBuf>`].
fn load_ignore_set(root: &std::path::Path) -> HashSet<PathBuf> {
    let ignore_file = root.join("ignore.txt");
    if !ignore_file.exists() {
        return HashSet::new();
    }

    let content = match fs::read_to_string(&ignore_file) {
        Ok(content) => content,
        Err(error) => {
            eprintln!("Could not read {}: {error}", ignore_file.display());
            return HashSet::new();
        }
    };

    content
        .lines()
        .map(str::trim)
        .filter(|line| !line.is_empty() && !line.starts_with('#'))
        .filter_map(|line| {
            let next_path = PathBuf::from(line);
            let absolute_path = if next_path.is_absolute() {
                next_path
            } else {
                root.join(next_path)
            };
            fs::canonicalize(absolute_path).ok()
        })
        .collect()
}

/// CLI: `uc_lexer_verify <root_dir>` - find all `.uc` files in the provided
/// directory (except those listed in `ignore.txt` in the root) and test them all.
///
/// Reported execution time is the tokenization time only; it does not include
/// the time it takes to read files from disk.
///
/// `ignore.txt` is for listing specific files, not directories.
fn main() {
    let root_dir = std::env::args().nth(1).unwrap(); // it is fine to crash a debug utility
    let root = PathBuf::from(&root_dir);

    if !root.exists() {
        eprintln!("Root directory '{root_dir}' does not exist.");
        std::process::exit(1);
    }

    // Load files
    let ignored_paths = load_ignore_set(&root);
    let mut uc_files: Vec<(PathBuf, String)> = Vec::new();
    for entry in walkdir::WalkDir::new(&root)
        .into_iter()
        .filter_map(Result::ok) // for a debug tool this is ok
        .filter(|entry| {
            let path = entry.path();
            // Skip anything explicitly ignored
            if let Ok(absolute_path) = fs::canonicalize(path) {
                if ignored_paths.contains(&absolute_path) {
                    return false;
                }
            }
            // Must be *.uc
            path.is_file()
                && path
                    .extension()
                    .and_then(|extension| extension.to_str())
                    .is_some_and(|extension| extension.eq_ignore_ascii_case("uc"))
        })
    {
        let path = entry.path();
        match fs::read(path) {
            Ok(raw_bytes) => {
                // Auto-detect encoding for old UnrealScript sources
                let (encoding_label, _, _) = chardet::detect(&raw_bytes);
                let encoding = encoding_rs::Encoding::for_label(encoding_label.as_bytes())
                    .unwrap_or(encoding_rs::UTF_8);
                let (decoded_text, _, _) = encoding.decode(&raw_bytes);
                uc_files.push((path.to_path_buf(), decoded_text.into_owned()));
            }
            Err(error) => {
                eprintln!("Failed to read `{}`: {error}", path.display());
                std::process::exit(1);
            }
        }
    }
    println!("Loaded {} .uc files into memory.", uc_files.len());

    // Tokenize and measure performance
    let start_time = std::time::Instant::now();
    let tokenized_files: Vec<(PathBuf, TokenizedFile)> = uc_files
        .iter()
        .map(|(path, source_code)| {
            let tokenized_file = TokenizedFile::from_source(source_code);
            if tokenized_file.had_errors() {
                println!("TK: {}", path.display());
            }
            (path.clone(), tokenized_file)
        })
        .collect();
    let elapsed_time = start_time.elapsed();
    println!(
        "Tokenized {} files in {:.2?}",
        tokenized_files.len(),
        elapsed_time
    );

    // Round-trip check
    for ((path, original), (_, tokenized_file)) in uc_files.iter().zip(tokenized_files.iter()) {
        let reconstructed = tokenized_file.reconstruct_source();
        if original != &reconstructed {
            eprintln!("Reconstruction mismatch in `{}`!", path.display());
            std::process::exit(1);
        }
    }

    println!("All .uc files matched successfully.");
}

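Both dev binaries repeat the same detect-then-decode dance: chardet guesses an encoding label and encoding_rs decodes with a UTF-8 fallback. A minimal sketch of a helper they could share is shown below; `decode_uc_source` is a hypothetical name and the function is not part of this commit, it only mirrors the calls already used in dump_tokens.rs and uc_lexer_verify.rs.

use encoding_rs::{Encoding, UTF_8};

/// Guess the encoding of `raw_bytes` and decode them, falling back to UTF-8.
fn decode_uc_source(raw_bytes: &[u8]) -> String {
    // chardet returns (label, confidence, language); only the label is needed.
    let (encoding_label, _, _) = chardet::detect(raw_bytes);
    let encoding = Encoding::for_label(encoding_label.as_bytes()).unwrap_or(UTF_8);
    let (decoded, _, _) = encoding.decode(raw_bytes);
    decoded.into_owned()
}
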
rottlib/Cargo.toml (Normal file, 11 lines)

@@ -0,0 +1,11 @@
[package]
name = "rottlib"
version = "0.1.0"
edition = "2024"

[features]
default = []
debug = []

[dependencies]
logos = "0.15"

rottlib/src/lexer/debug_tools.rs (Normal file, 92 lines)

@@ -0,0 +1,92 @@
//! Debug-only helpers for [`TokenizedFile`].
//!
//! This module is **compiled only if**
//!
//! * the current build profile has `debug_assertions` enabled, or
//! * the crate is built with the `debug` cargo feature.
//!
//! These checks have been moved to the parent module.

use super::Line;

/// A technical trait that adds debug helpers to the lexer.
pub trait DebugTools {
    /// Pretty-prints the internal layout of the tokenized file - useful when
    /// writing new passes or hunting lexer bugs.
    ///
    /// This method writes the layout directly to standard output.
    ///
    /// The format is unspecified, may change, and is not intended for
    /// external tools.
    ///
    /// Each line in the printed layout starts with its 1-based number for
    /// convenience.
    fn dump_debug_layout(&self);

    /// Reconstructs the exact, lossless source text that was fed to
    /// [`super::TokenizedFile::from_source`] from the internal representation -
    /// useful for manually verifying that the lexer works.
    fn reconstruct_source(&self) -> String;
}

impl<'src> DebugTools for super::TokenizedFile<'src> {
    fn reconstruct_source(&self) -> String {
        let mut result = String::new();
        for line in &self.lines {
            if let Line::Standalone(token_range) | Line::SpannedWithTokens(_, token_range) = line {
                for span in &self.buffer[token_range.clone()] {
                    result.push_str(span.lexeme);
                }
            }
        }
        result
    }

    fn dump_debug_layout(&self) {
        for (row_index, line) in self.lines.iter().enumerate() {
            println!("Line {}", row_index + 1);
            match line {
                Line::Standalone(token_range) => {
                    println!("\t[Standalone]");
                    let mut column_utf16 = 0usize;
                    for next_token_span in &self.buffer[token_range.clone()] {
                        let token_beginning = column_utf16;
                        let token_end = column_utf16 + next_token_span.length_utf16;
                        println!(
                            "\t\t{:?} @ {}-{}: {:?}",
                            next_token_span.token,
                            token_beginning,
                            token_end,
                            next_token_span.lexeme
                        );
                        column_utf16 = token_end;
                    }
                }
                Line::Spanned(origin_row) => {
                    // `origin_row` is 0-based
                    println!(
                        "\t[Continued from line {} - no new tokens here]",
                        origin_row + 1
                    );
                }
                Line::SpannedWithTokens(origin_row, token_range) => {
                    // `origin_row` is 0-based
                    println!("\t[Continued from line {} + new tokens]", origin_row + 1);
                    let mut column_utf16 = 0usize;
                    for next_token_span in &self.buffer[token_range.clone()] {
                        let token_beginning = column_utf16;
                        let token_end = column_utf16 + next_token_span.length_utf16;
                        println!(
                            "\t\t{:?} @ {}-{}: {:?}",
                            next_token_span.token,
                            token_beginning,
                            token_end,
                            next_token_span.lexeme
                        );
                        column_utf16 = token_end;
                    }
                }
            }
        }
    }
}

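The round-trip guarantee documented above is exactly what `uc_lexer_verify` leans on. A minimal sketch of exercising it through the public API, not part of the commit; it assumes a debug build or the `debug` feature so that `DebugTools` is re-exported:

use rottlib::lexer::{DebugTools, TokenizedFile};

fn main() {
    let source = "class Foo extends Bar;\n\nfunction Baz()\n{\n}\n";
    let tokenized = TokenizedFile::from_source(source);
    // Lossless: concatenating the stored lexemes reproduces the input exactly.
    assert_eq!(tokenized.reconstruct_source(), source);
    // Prints one entry per physical line, with each token's UTF-16 column range.
    tokenized.dump_debug_layout();
}
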
rottlib/src/lexer/lexing.rs (Normal file, 476 lines)

@@ -0,0 +1,476 @@
//! Lexer for UnrealScript that understands inline `cpptext { ... }` blocks.
//!
//! ## Notable details
//!
//! In UnrealScript, `cpptext` lets authors embed raw C++ between braces.
//! Because whitespace, newlines, or comments may appear between the
//! `cpptext` keyword and the opening `{`, the lexer must remember that
//! it has just seen `cpptext` - hence a state machine.
//!
//! Modes
//! ------
//! - **Normal** - ordinary UnrealScript tokens.
//! - **AwaitingCppBlock** - after `cpptext`, waiting for the next `{`.
//!
//! When that brace arrives, the lexer consumes the entire C++ block as
//! one token (`Token::Brace(BraceKind::CppBlock)`), tracking nested
//! braces, strings, and comments on the way. If the closing `}` is
//! missing, everything to EOF is treated as C++; downstream parsers must
//! handle that gracefully.

use logos::Lexer;

/// Which lexer mode we're in. See the module docs for the full story.
#[derive(Default, Clone, Copy, PartialEq, Eq)]
enum LexerMode {
    /// Lexing regular UnrealScript.
    #[default]
    Normal,
    /// Saw `cpptext`; waiting for the opening `{` of a C++ block.
    AwaitingCppBlock,
}

/// Extra per-lexer state. Currently just holds the [`LexerMode`].
///
/// This is a logos-specific implementation detail.
#[derive(Default)]
pub struct LexerState {
    mode: LexerMode,
}

/// Are these braces "real" UnrealScript braces, or the start/end of a C++ block?
#[derive(Debug, PartialEq, Clone, Copy)]
pub enum BraceKind {
    Normal,
    CppBlock,
}

/// All UnrealScript tokens that our compiler distinguishes.
#[derive(logos::Logos, Debug, PartialEq, Clone, Copy)]
#[logos(extras = LexerState)]
pub enum Token {
    // # Compiler/directive keywords
    #[regex(r"(?i)#exec[^\r\n]*(\r|\n|\r\n)")]
    ExecDirective,
    #[regex("(?i)cpptext", |lex| { lex.extras.mode = LexerMode::AwaitingCppBlock; })]
    CppText,

    // # Declaration & structural keywords
    #[regex("(?i)class")]
    Class,
    #[regex("(?i)struct")]
    Struct,
    #[regex("(?i)enum")]
    Enum,
    #[regex("(?i)state")]
    State,
    #[regex("(?i)function")]
    Function,
    #[regex("(?i)event")]
    Event,
    #[regex("(?i)delegate")]
    Delegate,
    #[regex("(?i)var")]
    Var,
    #[regex("(?i)local")]
    Local,

    // # Inheritance, interface, dependencies
    #[regex("(?i)extends")]
    Extends,
    #[regex("(?i)dependson")]
    DependsOn,

    // # Access modifiers & properties
    #[regex("(?i)private")]
    Private,
    #[regex("(?i)protected")]
    Protected,
    #[regex("(?i)public")]
    Public,
    #[regex("(?i)const")]
    Const,
    #[regex("(?i)static")]
    Static,
    #[regex("(?i)native")]
    Native,
    #[regex("(?i)abstract")]
    Abstract,
    #[regex("(?i)deprecated")]
    Deprecated,

    // # UnrealScript metadata/specifiers
    #[regex("(?i)default")]
    Default,
    #[regex("(?i)defaultproperties")]
    DefaultProperties,
    #[regex("(?i)optional")]
    Optional,
    #[regex("(?i)config")]
    Config,
    #[regex("(?i)perobjectconfig")]
    PerObjectConfig,
    #[regex("(?i)globalconfig")]
    GlobalConfig,
    #[regex("(?i)collapsecategories")]
    CollapseCategories,
    #[regex("(?i)dontcollapsecategories")]
    DontCollapseCategories,
    #[regex("(?i)hidecategories")]
    HideCategories,
    #[regex("(?i)localized")]
    Localized,
    #[regex("(?i)placeable")]
    Placeable,
    #[regex("(?i)notplaceable")]
    NotPlaceable,
    #[regex("(?i)editinlinenew")]
    EditInlineNew,
    #[regex("(?i)noteditinlinenew")]
    NotEditInlineNew,
    #[regex("(?i)dynamicrecompile")]
    DynamicRecompile,
    #[regex("(?i)transient")]
    Transient,
    #[regex("(?i)operator")]
    Operator,
    #[regex("(?i)simulated")]
    Simulated,
    #[regex("(?i)latent")]
    Latent,
    #[regex("(?i)iterator")]
    Iterator,
    #[regex("(?i)out")]
    Out,
    #[regex("(?i)skip")]
    Skip,
    #[regex("(?i)singular")]
    Singular,
    #[regex("(?i)coerce")]
    Coerce,
    #[regex("(?i)assert")]
    Assert,
    #[regex("(?i)ignores")]
    Ignores,
    #[regex("(?i)within")]
    Within,
    #[regex("(?i)noexport")]
    NoExport,

    // # Replication-related
    #[regex("(?i)reliable")]
    Reliable,
    #[regex("(?i)unreliable")]
    Unreliable,
    #[regex("(?i)replication")]
    Replication,
    #[regex("(?i)nativereplication")]
    NativeReplication,

    // # Control-flow keywords
    #[regex("(?i)if")]
    If,
    #[regex("(?i)else")]
    Else,
    #[regex("(?i)switch")]
    Switch,
    #[regex("(?i)case")]
    Case,
    #[regex("(?i)for")]
    For,
    #[regex("(?i)foreach")]
    ForEach,
    #[regex("(?i)while")]
    While,
    #[regex("(?i)do")]
    Do,
    #[regex("(?i)until")]
    Until,
    #[regex("(?i)break")]
    Break,
    #[regex("(?i)continue")]
    Continue,
    #[regex("(?i)return")]
    Return,

    // # Built-in types
    #[regex("(?i)int")]
    Int,
    #[regex("(?i)float")]
    Float,
    #[regex("(?i)bool")]
    Bool,
    #[regex("(?i)byte")]
    Byte,
    #[regex("(?i)string")]
    String,
    #[regex("(?i)array")]
    Array,
    #[regex("(?i)name")]
    Name,

    // # Literals & identifiers
    #[regex(r"0[xX][0-9A-Fa-f]+|[0-9]+")]
    IntegerLiteral,
    #[regex(r"[0-9]*\.[0-9]+([eE][+-]?[0-9]+)?")]
    FloatLiteral,
    #[regex(r#""([^"\\\r\n]|\\.)*""#)]
    StringLiteral,
    #[regex(r"'[a-zA-Z0-9_\. \-]*'")]
    NameLiteral,
    #[regex("(?i)true")]
    True,
    #[regex("(?i)false")]
    False,
    #[regex("(?i)none")]
    None,
    #[regex("(?i)self")]
    SelfKeyword,
    #[regex("(?i)new")]
    New,
    #[regex(r"[a-zA-Z_][a-zA-Z0-9_]*")]
    Identifier,

    // # Operations
    // ## Exponentiation
    #[token("**")]
    Exponentiation,
    // ## Unary
    #[token("++")]
    Increment,
    #[token("--")]
    Decrement,
    #[token("!")]
    Not,
    #[token("~")]
    BitwiseNot,
    // ## Vector
    #[token("dot")]
    Dot,
    #[token("cross")]
    Cross,
    // ## Multiplicative
    #[token("*")]
    Multiply,
    #[token("/")]
    Divide,
    #[token("%")]
    Modulo,
    // ## Additive
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    // ## String manipulation
    #[token("@")]
    AtChar,
    #[token("$")]
    DollarChar,
    // ## Shifts
    #[token("<<")]
    LeftShift,
    #[token(">>>")]
    LogicalRightShift,
    #[token(">>")]
    RightShift,
    // ## Relational
    #[token("<")]
    Less,
    #[token("<=")]
    LessEqual,
    #[token(">")]
    Greater,
    #[token(">=")]
    GreaterEqual,
    #[token("==")]
    Equal,
    #[token("!=")]
    NotEqual,
    #[token("~=")]
    ApproximatelyEqual,
    // ## Bitwise
    #[token("&")]
    BitwiseAnd,
    #[token("|")]
    BitwiseOr,
    #[token("^")]
    BitwiseXor,
    #[token("^^")]
    BooleanXor,
    // ## Logical
    #[token("&&")]
    And,
    #[token("||")]
    Or,
    // ## Assignments
    #[token("=")]
    Assign,
    #[token("*=")]
    MultiplyAssign,
    #[token("/=")]
    DivideAssign,
    #[token("+=")]
    PlusAssign,
    #[token("-=")]
    MinusAssign,
    #[token("$=")]
    ConcatAssign,
    #[token("@=")]
    ConcatSpaceAssign,

    // # Punctuation & delimiters
    #[token("(")]
    LeftParen,
    #[token(")")]
    RightParen,
    #[token("{", handle_brace)]
    Brace(BraceKind),
    #[token("}")]
    RightBrace,
    #[token("[")]
    LeftBracket,
    #[token("]")]
    RightBracket,
    #[token(";")]
    Semicolon,
    #[token(",")]
    Comma,
    #[token(".")]
    Period,
    #[token(":")]
    Colon,

    // # Comments & whitespaces
    #[regex(r"//[^\r\n]*")]
    LineComment,
    #[regex(r"/\*", handle_block_comment)]
    BlockComment,
    #[regex(r"\r\n|\n|\r")]
    NewLine,
    #[regex(r"[ \t]+")]
    Whitespace,

    // # Technical
    Error,
}

/// Consume a `/* ... */` block comment with arbitrary nesting
/// (like UnrealScript allows).
///
/// Matches the whole comment (delimiters included) or returns [`None`] if the
/// file ends before every `/*` is closed.
fn handle_block_comment(lexer: &mut Lexer<Token>) -> Option<()> {
    let mut comment_depth = 1;
    while let Some(next_char) = lexer.remainder().chars().next() {
        if lexer.remainder().starts_with("/*") {
            comment_depth += 1;
            lexer.bump(2);
            continue;
        }
        if lexer.remainder().starts_with("*/") {
            comment_depth -= 1;
            lexer.bump(2);
            if comment_depth == 0 {
                return Some(());
            }
            continue;
        }
        lexer.bump(next_char.len_utf8());
    }
    // Unterminated comment
    None
}

/// Called for every `{`.
///
/// Either emits an ordinary opening brace, or consumes an entire C++ block and
/// emits a single token for it, depending on the lexer's current state.
fn handle_brace(lexer: &mut Lexer<Token>) -> Option<BraceKind> {
    match lexer.extras.mode {
        LexerMode::Normal => Some(BraceKind::Normal),

        LexerMode::AwaitingCppBlock => {
            lexer.extras.mode = LexerMode::Normal;
            consume_cpp_block(lexer);
            Some(BraceKind::CppBlock)
        }
    }
}

/// Consumes a complete C++ block, handling:
/// - Nested `{...}` pairs
/// - String literals (`"..."` and `'...'`), including escaped quotes
/// - Line comments (`// ...\n`)
/// - Block comments (`/* ... */`)
///
/// Leaves the lexer positioned immediately after the closing `}` of the block.
/// The opening `{` must have already been consumed by the caller.
fn consume_cpp_block(lexer: &mut Lexer<Token>) {
    let mut depth = 1;
    while let Some(ch) = lexer.remainder().chars().next() {
        match ch {
            '{' => {
                depth += 1;
                lexer.bump(1);
            }
            '}' => {
                depth -= 1;
                lexer.bump(1);
                if depth == 0 {
                    break;
                }
            }
            '/' if lexer.remainder().starts_with("/*") => {
                lexer.bump(2); // consuming two-byte sequence `/*`
                consume_c_comment(lexer)
            }
            '/' if lexer.remainder().starts_with("//") => {
                lexer.bump(2); // consuming two-byte sequence `//`
                while let Some(c) = lexer.remainder().chars().next() {
                    lexer.bump(c.len_utf8());
                    if c == '\n' {
                        break;
                    }
                }
            }
            '"' | '\'' => {
                lexer.bump(1); // skip `'` or `"`
                consume_string_literal(lexer, ch);
            }
            _ => lexer.bump(ch.len_utf8()),
        }
    }
}

/// Consume a C-style `/* … */` comment (without nesting).
///
/// Assumes the opening `/*` has already been consumed.
fn consume_c_comment(lexer: &mut Lexer<Token>) {
    while let Some(next_character) = lexer.remainder().chars().next() {
        if lexer.remainder().starts_with("*/") {
            lexer.bump(2);
            break;
        } else {
            lexer.bump(next_character.len_utf8());
        }
    }
}

/// Consume a string literal from C++ code.
///
/// Assumes the opening quotation mark has already been consumed.
fn consume_string_literal(lexer: &mut Lexer<Token>, delimiter: char) {
    while let Some(next_character) = lexer.remainder().chars().next() {
        lexer.bump(next_character.len_utf8());
        if next_character == '\\' {
            // Skip the escaped character
            if let Some(next) = lexer.remainder().chars().next() {
                lexer.bump(next.len_utf8());
            }
        } else if next_character == delimiter {
            return;
        }
    }
}

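A hypothetical unit test that could sit at the bottom of this file to pin the `cpptext` behaviour down; the test module, its name, and its assertions are illustrative only and not part of the commit. It drives `Token` directly through the logos-generated lexer:

#[cfg(test)]
mod tests {
    use super::{BraceKind, Token};
    use logos::Logos;

    #[test]
    fn cpptext_body_is_a_single_token() {
        // A comment and a newline may separate `cpptext` from its `{`;
        // the whole `{ ... }` body should still come back as one token.
        let source = "cpptext // native glue\n{ int Count; }";
        let tokens: Vec<Token> = Token::lexer(source).filter_map(Result::ok).collect();
        assert!(tokens.contains(&Token::CppText));
        assert!(tokens.contains(&Token::Brace(BraceKind::CppBlock)));
        // The braces of the C++ block are swallowed by `consume_cpp_block`,
        // so no ordinary brace tokens should remain.
        assert!(!tokens.contains(&Token::Brace(BraceKind::Normal)));
        assert!(!tokens.contains(&Token::RightBrace));
    }
}
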
rottlib/src/lexer/mod.rs (Normal file, 276 lines)

@@ -0,0 +1,276 @@
//! # Tokenizer
//!
//! Converts raw source text into a lossless, position-aware stream of lexical
//! [`Token`]s, grouped *per physical line*, and returns it as
//! a [`TokenizedFile`].
//!
//! Design goals:
//!
//! 1. **Lossless**: preserving complete information for each token, enough to
//!    recreate the original bytes without loss.
//! 2. **LSP readiness**: the LSP heavily relies on UTF-16 support, so we
//!    precompute lengths of each token in that encoding, making interfacing
//!    easier.
//!
//! ## Opt-in debug helpers
//!
//! Extra diagnostics become available in **debug builds** or when the crate is
//! compiled with the `debug` feature enabled. They live in the [`DebugTools`]
//! extension trait, implemented for [`TokenizedFile`].
//!
//! ```
//! // bring the trait into scope
//! use lexer::DebugTools;
//!
//! let file = TokenizedFile::from_source(src);
//! file.dump_debug_layout(); // pretty-print token layout
//! let text = file.reconstruct_source(); // reconstruct original text
//! ```

mod debug_tools;
mod lexing;

use std::ops::Range;

use logos::Logos;

#[cfg(any(debug_assertions, feature = "debug"))]
pub use debug_tools::DebugTools;
pub use lexing::Token;

/// Empirically chosen starting size for the token buffer (used during
/// tokenization) that provides good performance.
const DEFAULT_TOKEN_BUFFER_CAPACITY: usize = 20_000;

/// A slice tagged with its token kind plus its length in UTF-16 code units.
///
/// *No absolute coordinates* are stored - they are recomputed per line.
#[derive(Debug, Clone, Copy)]
struct TokenSpan<'src> {
    lexeme: &'src str,
    token: Token,
    length_utf16: usize,
}

/// Representation of a single physical line of the source file.
///
/// [`Range<usize>`] values are used instead of slices to avoid creating
/// a self-referential struct (with [`TokenizedFile`]), which Rust forbids.
#[derive(Clone)]
enum Line {
    /// A standalone line that owns a contiguous slice in
    /// the [`TokenizedFile::buffer`] arena.
    Standalone(Range<usize>),
    /// A line that only continues a multi-line token; stores the 0-based
    /// number of the line on which that token started.
    Spanned(usize),
    /// A line that continues a multi-line token (0-based starting line)
    /// *and* contains additional tokens local to itself.
    SpannedWithTokens(usize, Range<usize>),
}

/// A tokenized, lossless representation of an UnrealScript source file.
pub struct TokenizedFile<'src> {
    /// Arena of every token span in this file.
    buffer: Vec<TokenSpan<'src>>,
    /// Mapping that provides easy and efficient access to tokens by
    /// line number.
    lines: Vec<Line>,
    /// Simple flag for marking erroneous state.
    had_errors: bool,
}

/// Mutable state that encapsulates data needed during the tokenization loop.
struct Tokenizer<'src> {
    /// Arena that owns every [`TokenSpan`] produced for the file.
    buffer: Vec<TokenSpan<'src>>,
    /// Mapping from physical line number to the tokens that belong to it.
    lines: Vec<Line>,
    /// The current 0-based physical line number.
    line_number: usize,
    /// Index in [`Tokenizer::buffer`] where the current line starts.
    slice_start_index: usize,
    /// When a multi-line token is being scanned, stores the 0-based line
    /// on which it started; [`None`] otherwise.
    multi_line_start: Option<usize>,
    /// Set to [`true`] if the lexer reported any error tokens.
    had_errors: bool,
}

impl<'src> TokenizedFile<'src> {
    /// Tokenize `source` and return a fresh [`TokenizedFile`].
    pub fn from_source(source: &'src str) -> TokenizedFile<'src> {
        let mut tokenizer = TokenizedFile::<'src>::builder();
        let mut lexer = Token::lexer(source);

        // Each item logos yields is either `Ok(token)` or a lexing error;
        // wrap it into a token span and feed it to the tokenizer state.
        while let Some(token_result) = lexer.next() {
            let token = token_result.unwrap_or_else(|_| {
                tokenizer.had_errors = true;
                Token::Error
            });
            let token_span = build_span(token, lexer.slice());
            tokenizer.process_token_span(token_span);
        }
        tokenizer.into_tokenized_file()
    }

    /// Returns [`true`] if any erroneous tokens were produced during building
    /// of this [`TokenizedFile`].
    pub fn had_errors(&self) -> bool {
        self.had_errors
    }

    /// Create an empty tokenizer state with tuned buffer capacity.
    fn builder() -> Tokenizer<'src> {
        Tokenizer {
            buffer: Vec::with_capacity(DEFAULT_TOKEN_BUFFER_CAPACITY),
            lines: Vec::new(),
            line_number: 0,
            slice_start_index: 0,
            multi_line_start: None,
            had_errors: false,
        }
    }
}

impl<'src> Tokenizer<'src> {
    /// Handles a token span and dispatches to the appropriate handler.
    fn process_token_span(&mut self, token_span: TokenSpan<'src>) {
        if token_can_span_lines(&token_span.token) {
            self.process_multi_line_token(token_span);
        } else {
            self.process_single_line_token(token_span);
        }
    }

    /// Handles tokens that never span multiple lines.
    fn process_single_line_token(&mut self, token_span: TokenSpan<'src>) {
        if token_is_newline(&token_span.token) {
            self.line_number += 1;
            self.buffer.push(token_span);
            self.commit_current_line();
        } else {
            self.buffer.push(token_span);
        }
    }

    /// Handles tokens that may contain one or more newline characters.
    fn process_multi_line_token(&mut self, token_span: TokenSpan<'src>) {
        let start_line = self.line_number;
        let newline_count = count_newlines(token_span.lexeme);

        // Did this token end in a newline?
        // This can happen if this is an `Error` token that ends the file.
        let ends_with_newline =
            token_span.lexeme.ends_with('\n') || token_span.lexeme.ends_with('\r');

        self.buffer.push(token_span);
        // We only need to commit the line if this token actually ended the line
        if newline_count > 0 {
            self.commit_current_line();
            // We only need to insert one `Line::Spanned(base)` per *interior*
            // newline, so `newline_count - 1` such lines
            // (e.g. 2 line breaks in a block comment -> it has
            // exactly `1` interior line)
            let insert_count = newline_count - 1;
            for _ in 0..insert_count {
                self.lines.push(Line::Spanned(start_line));
            }
            // This is set *after* `commit_current_line()` cleared the
            // previously stored value
            self.multi_line_start = if ends_with_newline {
                None // we're done at this point
            } else {
                Some(start_line)
            };
        }

        self.line_number = start_line + newline_count;
    }

    /// Commits the tokens of the current physical line into `self.lines`.
    fn commit_current_line(&mut self) {
        let slice_end = self.buffer.len();
        if slice_end > self.slice_start_index {
            let slice = self.slice_start_index..slice_end;

            // If we were in the middle of a multi-line token, we *always*
            // consume `multi_line_start` here, ensuring that each call to
            // `commit_current_line()` only applies it once.
            // This guarantees no "bleed" between adjacent multi-line tokens.
            if let Some(from) = self.multi_line_start.take() {
                self.lines.push(Line::SpannedWithTokens(from, slice));
            } else {
                self.lines.push(Line::Standalone(slice));
            }
            self.slice_start_index = slice_end;
        }
    }

    /// Finishes tokenization, converting accumulated data into
    /// [`TokenizedFile`].
    fn into_tokenized_file(mut self) -> TokenizedFile<'src> {
        // Commit any trailing tokens
        self.commit_current_line();
        // If we still have a `multi_line_start` (i.e. a pure multi-line token
        // with no local tokens on its last line), push a bare `Spanned` entry.
        if let Some(from) = self.multi_line_start.take() {
            self.lines.push(Line::Spanned(from));
        }

        // Optimize for size
        self.buffer.shrink_to_fit();
        self.lines.shrink_to_fit();

        TokenizedFile {
            buffer: self.buffer,
            lines: self.lines,
            had_errors: self.had_errors,
        }
    }
}

fn build_span<'src>(token: Token, text: &'src str) -> TokenSpan<'src> {
    let length_utf16 = text.encode_utf16().count();
    TokenSpan {
        lexeme: text,
        token,
        length_utf16,
    }
}

fn token_is_newline(token: &Token) -> bool {
    matches!(token, Token::NewLine)
}

fn token_can_span_lines(token: &Token) -> bool {
    matches!(
        token,
        Token::BlockComment | Token::Brace(lexing::BraceKind::CppBlock) | Token::Error
    )
}

/// Counts the number of line breaks in the given text.
fn count_newlines(text: &str) -> usize {
    let mut bytes_iterator = text.as_bytes().iter().peekable();
    let mut newline_count = 0;
    while let Some(&next_byte) = bytes_iterator.next() {
        // Logos' regex rule is "\r\n|\n|\r", so we agree with it on new line
        // character treatment
        match next_byte {
            b'\r' => {
                newline_count += 1;
                if let Some(&&b'\n') = bytes_iterator.peek() {
                    // skip the '\n' in a CRLF
                    bytes_iterator.next();
                }
            }
            b'\n' => {
                newline_count += 1;
            }
            _ => (),
        }
    }
    newline_count
}

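A small hypothetical test, not in the commit, that pins down the newline conventions `count_newlines` shares with the logos rule `\r\n|\n|\r`; it could live at the bottom of mod.rs:

#[cfg(test)]
mod tests {
    use super::count_newlines;

    #[test]
    fn crlf_cr_and_lf_each_count_as_one_break() {
        assert_eq!(count_newlines("a\r\nb"), 1); // CRLF is a single break
        assert_eq!(count_newlines("a\rb"), 1); // lone CR
        assert_eq!(count_newlines("a\nb"), 1); // lone LF
        // A block comment with two line breaks has exactly one interior line.
        assert_eq!(count_newlines("/* x\r\ny\r\nz */"), 2);
    }
}
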
rottlib/src/lib.rs (Normal file, 3 lines)

@@ -0,0 +1,3 @@
#![allow(clippy::doc_overindented_list_items)]

pub mod lexer;

rottlsp/Cargo.toml (Normal file, 12 lines)

@@ -0,0 +1,12 @@
[package]
name = "rottlsp"
version = "0.1.0"
edition = "2024"

[dependencies]
rottlib = { version = "0", path = "../rottlib" }
tokio = { version = "1", features = ["full"] }
tower-lsp = "0.20"

[lints]
workspace = true

rottlsp/src/main.rs (Normal file, 84 lines)

@@ -0,0 +1,84 @@
use tower_lsp::lsp_types;

/// A Language Server implementation for Rott.
///
/// Implements the [`tower_lsp::LanguageServer`] trait to handle LSP requests
/// (e.g. initialization, text synchronization, open notifications)
/// asynchronously.
struct RottLanguageServer {
    /// Client handle for sending notifications and requests to the editor.
    client: tower_lsp::Client,
}

#[tower_lsp::async_trait]
impl tower_lsp::LanguageServer for RottLanguageServer {
    // Inform the client of our server capabilities during initialization.
    async fn initialize(
        &self,
        _: lsp_types::InitializeParams,
    ) -> tower_lsp::jsonrpc::Result<lsp_types::InitializeResult> {
        Ok(lsp_types::InitializeResult {
            capabilities: lsp_types::ServerCapabilities {
                // We can synchronize the text of files, which means we request
                // to receive full updates whenever a file is opened or changed.
                // `lsp_types::TextDocumentSyncKind::FULL` means we require the
                // full text every time.
                text_document_sync: Some(lsp_types::TextDocumentSyncCapability::Kind(
                    lsp_types::TextDocumentSyncKind::FULL,
                )),
                ..Default::default()
            },
            ..Default::default()
        })
    }

    // On file open, tokenize the new document and log any lexing errors.
    async fn did_open(&self, params: lsp_types::DidOpenTextDocumentParams) {
        // Measure lexing performance to track parser responsiveness.
        let start_time = std::time::Instant::now();
        let has_errors =
            rottlib::lexer::TokenizedFile::from_source(&params.text_document.text).had_errors();
        let elapsed_time = start_time.elapsed();

        self.client
            .log_message(
                lsp_types::MessageType::INFO,
                format!(
                    "Tokenized {} in {:?}",
                    params.text_document.uri.path(),
                    elapsed_time
                ),
            )
            .await;

        if has_errors {
            self.client
                .log_message(
                    lsp_types::MessageType::INFO,
                    format!(
                        "There was an error while tokenizing {}",
                        params.text_document.uri.path(),
                    ),
                )
                .await;
        }
    }

    // Handle shutdown signal.
    async fn shutdown(&self) -> tower_lsp::jsonrpc::Result<()> {
        // No cleanup required on shutdown; simply acknowledge the request.
        Ok(())
    }
}

#[tokio::main]
async fn main() {
    // We are using standard input and output for communicating with an editor,
    // so we need to avoid methods or macros that write or read using them,
    // e.g. `println!`.
    let (stdin, stdout) = (tokio::io::stdin(), tokio::io::stdout());
    let (service, socket) = tower_lsp::LspService::new(|client| RottLanguageServer { client });
    tower_lsp::Server::new(stdin, stdout, socket)
        .serve(service)
        .await;
}

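Right now lexing errors only reach the client log. One possible follow-up, sketched here and not part of this commit, would be to surface them as LSP diagnostics; the helper name is hypothetical and the sketch assumes only the standard tower-lsp 0.20 client call `publish_diagnostics`:

// Hypothetical addition to `did_open`: publish a single file-level diagnostic
// when the tokenizer reports errors, instead of only logging.
async fn report_lexing_error(client: &tower_lsp::Client, uri: lsp_types::Url) {
    let diagnostic = lsp_types::Diagnostic::new_simple(
        // Until the lexer exposes error positions, point at the start of the file.
        lsp_types::Range::new(
            lsp_types::Position::new(0, 0),
            lsp_types::Position::new(0, 0),
        ),
        "File contains tokens the UnrealScript lexer could not recognize.".to_string(),
    );
    client.publish_diagnostics(uri, vec![diagnostic], None).await;
}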