Initial commit

commit 4b9d6a6adb

.gitignore (vendored, Normal file, 3 lines)

@@ -0,0 +1,3 @@
/target
flamegraph.svg
perf.data

Cargo.lock (generated, Normal file, 1104 lines)

File diff suppressed because it is too large.

Cargo.toml (Normal file, 26 lines)

@@ -0,0 +1,26 @@
[workspace]
resolver = "2"
members = ["dev_tests", "rottlsp", "rottlib"]

[workspace.package]
edition = "2024"

[workspace.lints.clippy]
all = "warn"
nursery = "warn"
pedantic = "warn"

[profile.release]
opt-level = 3 # Optimize for speed
strip = true # Strip symbols from binary
lto = true # Enable link-time optimization
panic = "abort" # Abort on panic
overflow-checks = false # no integer checks
codegen-units = 1 # Reduce number of codegen units to increase optimizations
debug = false # strip all debug info

[profile.flamegraph]
inherits = "release" # start from release
strip = false
debug = true # full DWARF info for unwinding
split-debuginfo = "unpacked" # keep symbols inside the binary

dev_tests/Cargo.toml (Normal file, 23 lines)

@@ -0,0 +1,23 @@
[package]
name = "dev_tests"
version = "0.1.0"
edition = "2024"

[[bin]]
name = "dump_tokens"
path = "src/dump_tokens.rs"

[[bin]]
name = "uc_lexer_verify"
path = "src/uc_lexer_verify.rs"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
rottlib = { version = "0", path = "../rottlib", features = ["debug"] }
walkdir = "2.5"
encoding_rs = "0.8"
chardet = "0.2"

[lints]
workspace = true

dev_tests/src/dump_tokens.rs (Normal file, 76 lines)

@@ -0,0 +1,76 @@
use std::{
    fs,
    path::{Path, PathBuf},
};

use encoding_rs::{Encoding, UTF_8};
use rottlib::lexer::{DebugTools, TokenizedFile};

/// Recursively search `root` for the first file whose *basename* matches
/// `needle` (case-sensitive).
///
/// Returns the absolute path.
fn find_file(root: &Path, needle: &str) -> Option<PathBuf> {
    for entry in walkdir::WalkDir::new(root)
        .into_iter()
        .filter_map(Result::ok)
    {
        let path = entry.path();
        if path.is_file() && (path.file_name().and_then(|name| name.to_str()) == Some(needle)) {
            return fs::canonicalize(path).ok();
        }
    }
    None
}

/// CLI: `dump_tokens <root_dir> <file_name>` - searches for `<file_name>`
/// recursively inside `<root_dir>`.
///
/// This utility takes a *root directory* and a *file name* instead of a full
/// path, saving us from hunting down and typing out full paths:
///
/// - We know where all the sources are;
/// - We usually just know the name of the file that is being problematic.
fn main() {
    let mut args = std::env::args().skip(1);
    let root_dir = args.next().unwrap_or_else(|| {
        eprintln!("Usage: dump_tokens <root_dir> <file_name>");
        std::process::exit(1);
    });
    let file_name = args.next().unwrap_or_else(|| {
        eprintln!("Usage: dump_tokens <root_dir> <file_name>");
        std::process::exit(1);
    });

    let root = PathBuf::from(&root_dir);
    if !root.exists() {
        eprintln!("Root directory '{root_dir}' does not exist.");
        std::process::exit(1);
    }

    let found_path = find_file(&root, &file_name).map_or_else(
        || {
            eprintln!("File '{file_name}' not found under '{root_dir}'.");
            std::process::exit(1);
        },
        |path| path,
    );

    // Read & decode
    let raw_bytes = match fs::read(&found_path) {
        Ok(sources) => sources,
        Err(error) => {
            eprintln!("Could not read {}: {error}", found_path.display());
            std::process::exit(1);
        }
    };

    let (encoding_label, _, _) = chardet::detect(&raw_bytes);
    let encoding = Encoding::for_label(encoding_label.as_bytes()).unwrap_or(UTF_8);
    let (decoded_str, _, _) = encoding.decode(&raw_bytes);

    let source_text = decoded_str.to_string();
    let tokenized_file = TokenizedFile::from_source(&source_text);

    tokenized_file.dump_debug_layout();
}

dev_tests/src/uc_lexer_verify.rs (Normal file, 122 lines)

@@ -0,0 +1,122 @@
use std::{collections::HashSet, fs, path::PathBuf};

use rottlib::lexer::{DebugTools, TokenizedFile};

/// Read `ignore.txt` (one path per line, `#` for comments) from the root
/// directory and turn it into a canonicalized [`HashSet<PathBuf>`].
fn load_ignore_set(root: &std::path::Path) -> HashSet<PathBuf> {
    let ignore_file = root.join("ignore.txt");
    if !ignore_file.exists() {
        return HashSet::new();
    }

    let content = match fs::read_to_string(&ignore_file) {
        Ok(content) => content,
        Err(error) => {
            eprintln!("Could not read {}: {error}", ignore_file.display());
            return HashSet::new();
        }
    };

    content
        .lines()
        .map(str::trim)
        .filter(|line| !line.is_empty() && !line.starts_with('#'))
        .filter_map(|line| {
            let next_path = PathBuf::from(line);
            let absolute_path = if next_path.is_absolute() {
                next_path
            } else {
                root.join(next_path)
            };
            fs::canonicalize(absolute_path).ok()
        })
        .collect()
}

/// CLI: `uc_lexer_verify <root_dir>` - find all `.uc` files in the provided
/// directory (except those listed in `ignore.txt` in the root) and test them all.
///
/// Reported execution time is the tokenization time only; it does not include
/// the time it takes to read files from disk.
///
/// `ignore.txt` is for listing specific files, not directories.
fn main() {
    let root_dir = std::env::args().nth(1).unwrap(); // it is fine to crash a debug utility
    let root = PathBuf::from(&root_dir);

    if !root.exists() {
        eprintln!("Root directory '{root_dir}' does not exist.");
        std::process::exit(1);
    }

    // Load files
    let ignored_paths = load_ignore_set(&root);
    let mut uc_files: Vec<(PathBuf, String)> = Vec::new();
    for entry in walkdir::WalkDir::new(&root)
        .into_iter()
        .filter_map(Result::ok) // for a debug tool this is ok
        .filter(|entry| {
            let path = entry.path();
            // Skip anything explicitly ignored
            if let Ok(absolute_path) = fs::canonicalize(path) {
                if ignored_paths.contains(&absolute_path) {
                    return false;
                }
            }
            // Must be *.uc
            path.is_file()
                && path
                    .extension()
                    .and_then(|extension| extension.to_str())
                    .is_some_and(|extension| extension.eq_ignore_ascii_case("uc"))
        })
    {
        let path = entry.path();
        match fs::read(path) {
            Ok(raw_bytes) => {
                // Auto-detect encoding for old UnrealScript sources
                let (encoding_label, _, _) = chardet::detect(&raw_bytes);
                let encoding = encoding_rs::Encoding::for_label(encoding_label.as_bytes())
                    .unwrap_or(encoding_rs::UTF_8);
                let (decoded_text, _, _) = encoding.decode(&raw_bytes);
                uc_files.push((path.to_path_buf(), decoded_text.into_owned()));
            }
            Err(error) => {
                eprintln!("Failed to read `{}`: {error}", path.display());
                std::process::exit(1);
            }
        }
    }
    println!("Loaded {} .uc files into memory.", uc_files.len());

    // Tokenize and measure performance
    let start_time = std::time::Instant::now();
    let tokenized_files: Vec<(PathBuf, TokenizedFile)> = uc_files
        .iter()
        .map(|(path, source_code)| {
            let tokenized_file = TokenizedFile::from_source(source_code);
            if tokenized_file.had_errors() {
                println!("TK: {}", path.display());
            }
            (path.clone(), tokenized_file)
        })
        .collect();
    let elapsed_time = start_time.elapsed();
    println!(
        "Tokenized {} files in {:.2?}",
        tokenized_files.len(),
        elapsed_time
    );

    // Round-trip check
    for ((path, original), (_, tokenized_file)) in uc_files.iter().zip(tokenized_files.iter()) {
        let reconstructed = tokenized_file.reconstruct_source();
        if original != &reconstructed {
            eprintln!("Reconstruction mismatch in `{}`!", path.display());
            std::process::exit(1);
        }
    }

    println!("All .uc files matched successfully.");
}

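Both dev binaries repeat the same detect-then-decode dance: chardet guesses an encoding label and encoding_rs decodes with a UTF-8 fallback. A minimal sketch of a helper they could share is shown below; `decode_uc_source` is a hypothetical name and the function is not part of this commit, it only mirrors the calls already used in dump_tokens.rs and uc_lexer_verify.rs.

use encoding_rs::{Encoding, UTF_8};

/// Guess the encoding of `raw_bytes` and decode them, falling back to UTF-8.
fn decode_uc_source(raw_bytes: &[u8]) -> String {
    // chardet returns (label, confidence, language); only the label is needed.
    let (encoding_label, _, _) = chardet::detect(raw_bytes);
    let encoding = Encoding::for_label(encoding_label.as_bytes()).unwrap_or(UTF_8);
    let (decoded, _, _) = encoding.decode(raw_bytes);
    decoded.into_owned()
}
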
rottlib/Cargo.toml (Normal file, 11 lines)

@@ -0,0 +1,11 @@
[package]
name = "rottlib"
version = "0.1.0"
edition = "2024"

[features]
default = []
debug = []

[dependencies]
logos = "0.15"

rottlib/src/lexer/debug_tools.rs (Normal file, 92 lines)

@@ -0,0 +1,92 @@
//! Debug-only helpers for [`TokenizedFile`].
//!
//! This module is **compiled only if**
//!
//! * the current build profile has `debug_assertions` enabled, or
//! * the crate is built with the `debug` cargo feature.
//!
//! These checks have been moved to the parent module.

use super::Line;

/// A technical trait that adds debug helpers to the lexer.
pub trait DebugTools {
    /// Pretty-prints the internal layout of the tokenized file - useful when
    /// writing new passes or hunting lexer bugs.
    ///
    /// This method writes the layout directly to standard output.
    ///
    /// The format is unspecified, may change, and is not intended for
    /// external tools.
    ///
    /// Each line in the printed layout starts with its 1-based number for
    /// convenience.
    fn dump_debug_layout(&self);

    /// Reconstructs the exact, lossless source text that was fed to
    /// [`super::TokenizedFile::from_source`] from the internal representation -
    /// useful for manually verifying that the lexer works.
    fn reconstruct_source(&self) -> String;
}

impl<'src> DebugTools for super::TokenizedFile<'src> {
    fn reconstruct_source(&self) -> String {
        let mut result = String::new();
        for line in &self.lines {
            if let Line::Standalone(token_range) | Line::SpannedWithTokens(_, token_range) = line {
                for span in &self.buffer[token_range.clone()] {
                    result.push_str(span.lexeme);
                }
            }
        }
        result
    }

    fn dump_debug_layout(&self) {
        for (row_index, line) in self.lines.iter().enumerate() {
            println!("Line {}", row_index + 1);
            match line {
                Line::Standalone(token_range) => {
                    println!("\t[Standalone]");
                    let mut column_utf16 = 0usize;
                    for next_token_span in &self.buffer[token_range.clone()] {
                        let token_beginning = column_utf16;
                        let token_end = column_utf16 + next_token_span.length_utf16;
                        println!(
                            "\t\t{:?} @ {}-{}: {:?}",
                            next_token_span.token,
                            token_beginning,
                            token_end,
                            next_token_span.lexeme
                        );
                        column_utf16 = token_end;
                    }
                }
                Line::Spanned(origin_row) => {
                    // `origin_row` is 0-based
                    println!(
                        "\t[Continued from line {} - no new tokens here]",
                        origin_row + 1
                    );
                }
                Line::SpannedWithTokens(origin_row, token_range) => {
                    // `origin_row` is 0-based
                    println!("\t[Continued from line {} + new tokens]", origin_row + 1);
                    let mut column_utf16 = 0usize;
                    for next_token_span in &self.buffer[token_range.clone()] {
                        let token_beginning = column_utf16;
                        let token_end = column_utf16 + next_token_span.length_utf16;
                        println!(
                            "\t\t{:?} @ {}-{}: {:?}",
                            next_token_span.token,
                            token_beginning,
                            token_end,
                            next_token_span.lexeme
                        );
                        column_utf16 = token_end;
                    }
                }
            }
        }
    }
}

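The round-trip guarantee documented above is exactly what `uc_lexer_verify` leans on. A minimal sketch of exercising it through the public API, not part of the commit; it assumes a debug build or the `debug` feature so that `DebugTools` is re-exported:

use rottlib::lexer::{DebugTools, TokenizedFile};

fn main() {
    let source = "class Foo extends Bar;\n\nfunction Baz()\n{\n}\n";
    let tokenized = TokenizedFile::from_source(source);
    // Lossless: concatenating the stored lexemes reproduces the input exactly.
    assert_eq!(tokenized.reconstruct_source(), source);
    // Prints one entry per physical line, with each token's UTF-16 column range.
    tokenized.dump_debug_layout();
}
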
rottlib/src/lexer/lexing.rs (Normal file, 476 lines)

@@ -0,0 +1,476 @@
//! Lexer for UnrealScript that understands inline `cpptext { ... }` blocks.
//!
//! ## Notable details
//!
//! In UnrealScript, `cpptext` lets authors embed raw C++ between braces.
//! Because whitespace, newlines, or comments may appear between the
//! `cpptext` keyword and the opening `{`, the lexer must remember that
//! it has just seen `cpptext` - hence a state machine.
//!
//! Modes
//! ------
//! - **Normal** - ordinary UnrealScript tokens.
//! - **AwaitingCppBlock** - after `cpptext`, waiting for the next `{`.
//!
//! When that brace arrives, the lexer consumes the entire C++ block as
//! one token (`Token::Brace(BraceKind::CppBlock)`), tracking nested
//! braces, strings, and comments on the way. If the closing `}` is
//! missing, everything to EOF is treated as C++; downstream parsers must
//! handle that gracefully.

use logos::Lexer;

/// Which lexer mode we're in. See the module docs for the full story.
#[derive(Default, Clone, Copy, PartialEq, Eq)]
enum LexerMode {
    /// Lexing regular UnrealScript.
    #[default]
    Normal,
    /// Saw `cpptext`; waiting for the opening `{` of a C++ block.
    AwaitingCppBlock,
}

/// Extra per-lexer state. Currently just holds the [`LexerMode`].
///
/// This is a logos-specific implementation detail.
#[derive(Default)]
pub struct LexerState {
    mode: LexerMode,
}

/// Are these braces "real" UnrealScript braces, or the start/end of a C++ block?
#[derive(Debug, PartialEq, Clone, Copy)]
pub enum BraceKind {
    Normal,
    CppBlock,
}

/// All UnrealScript tokens that our compiler distinguishes.
#[derive(logos::Logos, Debug, PartialEq, Clone, Copy)]
#[logos(extras = LexerState)]
pub enum Token {
    // # Compiler/directive keywords
    #[regex(r"(?i)#exec[^\r\n]*(\r|\n|\r\n)")]
    ExecDirective,
    #[regex("(?i)cpptext", |lex| { lex.extras.mode = LexerMode::AwaitingCppBlock; })]
    CppText,

    // # Declaration & structural keywords
    #[regex("(?i)class")]
    Class,
    #[regex("(?i)struct")]
    Struct,
    #[regex("(?i)enum")]
    Enum,
    #[regex("(?i)state")]
    State,
    #[regex("(?i)function")]
    Function,
    #[regex("(?i)event")]
    Event,
    #[regex("(?i)delegate")]
    Delegate,
    #[regex("(?i)var")]
    Var,
    #[regex("(?i)local")]
    Local,

    // # Inheritance, interface, dependencies
    #[regex("(?i)extends")]
    Extends,
    #[regex("(?i)dependson")]
    DependsOn,

    // # Access modifiers & properties
    #[regex("(?i)private")]
    Private,
    #[regex("(?i)protected")]
    Protected,
    #[regex("(?i)public")]
    Public,
    #[regex("(?i)const")]
    Const,
    #[regex("(?i)static")]
    Static,
    #[regex("(?i)native")]
    Native,
    #[regex("(?i)abstract")]
    Abstract,
    #[regex("(?i)deprecated")]
    Deprecated,

    // # UnrealScript metadata/specifiers
    #[regex("(?i)default")]
    Default,
    #[regex("(?i)defaultproperties")]
    DefaultProperties,
    #[regex("(?i)optional")]
    Optional,
    #[regex("(?i)config")]
    Config,
    #[regex("(?i)perobjectconfig")]
    PerObjectConfig,
    #[regex("(?i)globalconfig")]
    GlobalConfig,
    #[regex("(?i)collapsecategories")]
    CollapseCategories,
    #[regex("(?i)dontcollapsecategories")]
    DontCollapseCategories,
    #[regex("(?i)hidecategories")]
    HideCategories,
    #[regex("(?i)localized")]
    Localized,
    #[regex("(?i)placeable")]
    Placeable,
    #[regex("(?i)notplaceable")]
    NotPlaceable,
    #[regex("(?i)editinlinenew")]
    EditInlineNew,
    #[regex("(?i)noteditinlinenew")]
    NotEditInlineNew,
    #[regex("(?i)dynamicrecompile")]
    DynamicRecompile,
    #[regex("(?i)transient")]
    Transient,
    #[regex("(?i)operator")]
    Operator,
    #[regex("(?i)simulated")]
    Simulated,
    #[regex("(?i)latent")]
    Latent,
    #[regex("(?i)iterator")]
    Iterator,
    #[regex("(?i)out")]
    Out,
    #[regex("(?i)skip")]
    Skip,
    #[regex("(?i)singular")]
    Singular,
    #[regex("(?i)coerce")]
    Coerce,
    #[regex("(?i)assert")]
    Assert,
    #[regex("(?i)ignores")]
    Ignores,
    #[regex("(?i)within")]
    Within,
    #[regex("(?i)noexport")]
    NoExport,

    // # Replication-related
    #[regex("(?i)reliable")]
    Reliable,
    #[regex("(?i)unreliable")]
    Unreliable,
    #[regex("(?i)replication")]
    Replication,
    #[regex("(?i)nativereplication")]
    NativeReplication,

    // # Control-flow keywords
    #[regex("(?i)if")]
    If,
    #[regex("(?i)else")]
    Else,
    #[regex("(?i)switch")]
    Switch,
    #[regex("(?i)case")]
    Case,
    #[regex("(?i)for")]
    For,
    #[regex("(?i)foreach")]
    ForEach,
    #[regex("(?i)while")]
    While,
    #[regex("(?i)do")]
    Do,
    #[regex("(?i)until")]
    Until,
    #[regex("(?i)break")]
    Break,
    #[regex("(?i)continue")]
    Continue,
    #[regex("(?i)return")]
    Return,

    // # Built-in types
    #[regex("(?i)int")]
    Int,
    #[regex("(?i)float")]
    Float,
    #[regex("(?i)bool")]
    Bool,
    #[regex("(?i)byte")]
    Byte,
    #[regex("(?i)string")]
    String,
    #[regex("(?i)array")]
    Array,
    #[regex("(?i)name")]
    Name,

    // # Literals & identifiers
    #[regex(r"0[xX][0-9A-Fa-f]+|[0-9]+")]
    IntegerLiteral,
    #[regex(r"[0-9]*\.[0-9]+([eE][+-]?[0-9]+)?")]
    FloatLiteral,
    #[regex(r#""([^"\\\r\n]|\\.)*""#)]
    StringLiteral,
    #[regex(r"'[a-zA-Z0-9_\. \-]*'")]
    NameLiteral,
    #[regex("(?i)true")]
    True,
    #[regex("(?i)false")]
    False,
    #[regex("(?i)none")]
    None,
    #[regex("(?i)self")]
    SelfKeyword,
    #[regex("(?i)new")]
    New,
    #[regex(r"[a-zA-Z_][a-zA-Z0-9_]*")]
    Identifier,

    // # Operations
    // ## Exponentiation
    #[token("**")]
    Exponentiation,
    // ## Unary
    #[token("++")]
    Increment,
    #[token("--")]
    Decrement,
    #[token("!")]
    Not,
    #[token("~")]
    BitwiseNot,
    // ## Vector
    #[token("dot")]
    Dot,
    #[token("cross")]
    Cross,
    // ## Multiplicative
    #[token("*")]
    Multiply,
    #[token("/")]
    Divide,
    #[token("%")]
    Modulo,
    // ## Additive
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    // ## String manipulation
    #[token("@")]
    AtChar,
    #[token("$")]
    DollarChar,
    // ## Shifts
    #[token("<<")]
    LeftShift,
    #[token(">>>")]
    LogicalRightShift,
    #[token(">>")]
    RightShift,
    // ## Relational
    #[token("<")]
    Less,
    #[token("<=")]
    LessEqual,
    #[token(">")]
    Greater,
    #[token(">=")]
    GreaterEqual,
    #[token("==")]
    Equal,
    #[token("!=")]
    NotEqual,
    #[token("~=")]
    ApproximatelyEqual,
    // ## Bitwise
    #[token("&")]
    BitwiseAnd,
    #[token("|")]
    BitwiseOr,
    #[token("^")]
    BitwiseXor,
    #[token("^^")]
    BooleanXor,
    // ## Logical
    #[token("&&")]
    And,
    #[token("||")]
    Or,
    // ## Assignments
    #[token("=")]
    Assign,
    #[token("*=")]
    MultiplyAssign,
    #[token("/=")]
    DivideAssign,
    #[token("+=")]
    PlusAssign,
    #[token("-=")]
    MinusAssign,
    #[token("$=")]
    ConcatAssign,
    #[token("@=")]
    ConcatSpaceAssign,

    // # Punctuation & delimiters
    #[token("(")]
    LeftParen,
    #[token(")")]
    RightParen,
    #[token("{", handle_brace)]
    Brace(BraceKind),
    #[token("}")]
    RightBrace,
    #[token("[")]
    LeftBracket,
    #[token("]")]
    RightBracket,
    #[token(";")]
    Semicolon,
    #[token(",")]
    Comma,
    #[token(".")]
    Period,
    #[token(":")]
    Colon,

    // # Comments & whitespaces
    #[regex(r"//[^\r\n]*")]
    LineComment,
    #[regex(r"/\*", handle_block_comment)]
    BlockComment,
    #[regex(r"\r\n|\n|\r")]
    NewLine,
    #[regex(r"[ \t]+")]
    Whitespace,

    // # Technical
    Error,
}

/// Consume a `/* ... */` block comment with arbitrary nesting
/// (like UnrealScript allows).
///
/// Matches the whole comment (delimiters included) or returns [`None`] if the
/// file ends before every `/*` is closed.
fn handle_block_comment(lexer: &mut Lexer<Token>) -> Option<()> {
    let mut comment_depth = 1;
    while let Some(next_char) = lexer.remainder().chars().next() {
        if lexer.remainder().starts_with("/*") {
            comment_depth += 1;
            lexer.bump(2);
            continue;
        }
        if lexer.remainder().starts_with("*/") {
            comment_depth -= 1;
            lexer.bump(2);
            if comment_depth == 0 {
                return Some(());
            }
            continue;
        }
        lexer.bump(next_char.len_utf8());
    }
    // Unterminated comment
    None
}

/// Called for every `{`.
///
/// Either emits an ordinary opening brace, or consumes an entire C++ block and
/// emits a single token for it, depending on the lexer's current state.
fn handle_brace(lexer: &mut Lexer<Token>) -> Option<BraceKind> {
    match lexer.extras.mode {
        LexerMode::Normal => Some(BraceKind::Normal),

        LexerMode::AwaitingCppBlock => {
            lexer.extras.mode = LexerMode::Normal;
            consume_cpp_block(lexer);
            Some(BraceKind::CppBlock)
        }
    }
}

/// Consumes a complete C++ block, handling:
/// - Nested `{...}` pairs
/// - String literals (`"..."` and `'...'`), including escaped quotes
/// - Line comments (`// ...\n`)
/// - Block comments (`/* ... */`)
///
/// Leaves the lexer positioned immediately after the closing `}` of the block.
/// The opening `{` must have already been consumed by the caller.
fn consume_cpp_block(lexer: &mut Lexer<Token>) {
    let mut depth = 1;
    while let Some(ch) = lexer.remainder().chars().next() {
        match ch {
            '{' => {
                depth += 1;
                lexer.bump(1);
            }
            '}' => {
                depth -= 1;
                lexer.bump(1);
                if depth == 0 {
                    break;
                }
            }
            '/' if lexer.remainder().starts_with("/*") => {
                lexer.bump(2); // consuming two-byte sequence `/*`
                consume_c_comment(lexer)
            }
            '/' if lexer.remainder().starts_with("//") => {
                lexer.bump(2); // consuming two-byte sequence `//`
                while let Some(c) = lexer.remainder().chars().next() {
                    lexer.bump(c.len_utf8());
                    if c == '\n' {
                        break;
                    }
                }
            }
            '"' | '\'' => {
                lexer.bump(1); // skip `'` or `"`
                consume_string_literal(lexer, ch);
            }
            _ => lexer.bump(ch.len_utf8()),
        }
    }
}

/// Consume a C-style `/* … */` comment (without nesting).
///
/// Assumes the opening `/*` has already been consumed.
fn consume_c_comment(lexer: &mut Lexer<Token>) {
    while let Some(next_character) = lexer.remainder().chars().next() {
        if lexer.remainder().starts_with("*/") {
            lexer.bump(2);
            break;
        } else {
            lexer.bump(next_character.len_utf8());
        }
    }
}

/// Consume a string literal from C++ code.
///
/// Assumes the opening quotation mark has already been consumed.
fn consume_string_literal(lexer: &mut Lexer<Token>, delimiter: char) {
    while let Some(next_character) = lexer.remainder().chars().next() {
        lexer.bump(next_character.len_utf8());
        if next_character == '\\' {
            // Skip the escaped character
            if let Some(next) = lexer.remainder().chars().next() {
                lexer.bump(next.len_utf8());
            }
        } else if next_character == delimiter {
            return;
        }
    }
}

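A hypothetical unit test that could sit at the bottom of this file to pin the `cpptext` behaviour down; the test module, its name, and its assertions are illustrative only and not part of the commit. It drives `Token` directly through the logos-generated lexer:

#[cfg(test)]
mod tests {
    use super::{BraceKind, Token};
    use logos::Logos;

    #[test]
    fn cpptext_body_is_a_single_token() {
        // A comment and a newline may separate `cpptext` from its `{`;
        // the whole `{ ... }` body should still come back as one token.
        let source = "cpptext // native glue\n{ int Count; }";
        let tokens: Vec<Token> = Token::lexer(source).filter_map(Result::ok).collect();
        assert!(tokens.contains(&Token::CppText));
        assert!(tokens.contains(&Token::Brace(BraceKind::CppBlock)));
        // The braces of the C++ block are swallowed by `consume_cpp_block`,
        // so no ordinary brace tokens should remain.
        assert!(!tokens.contains(&Token::Brace(BraceKind::Normal)));
        assert!(!tokens.contains(&Token::RightBrace));
    }
}
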
rottlib/src/lexer/mod.rs (Normal file, 276 lines)

@@ -0,0 +1,276 @@
//! # Tokenizer
//!
//! Converts raw source text into a lossless, position-aware stream of lexical
//! [`Token`]s, grouped *per physical line*, and returns it as
//! a [`TokenizedFile`].
//!
//! Design goals:
//!
//! 1. **Lossless**: preserving complete information for each token, enough to
//!    recreate the original bytes without loss.
//! 2. **LSP readiness**: the LSP heavily relies on UTF-16 support, so we
//!    precompute lengths of each token in that encoding, making interfacing
//!    easier.
//!
//! ## Opt-in debug helpers
//!
//! Extra diagnostics become available in **debug builds** or when the crate is
//! compiled with the `debug` feature enabled. They live in the [`DebugTools`]
//! extension trait, implemented for [`TokenizedFile`].
//!
//! ```
//! // bring the trait into scope
//! use lexer::DebugTools;
//!
//! let file = TokenizedFile::from_source(src);
//! file.dump_debug_layout(); // pretty-print token layout
//! let text = file.reconstruct_source(); // reconstruct original text
//! ```

mod debug_tools;
mod lexing;

use std::ops::Range;

use logos::Logos;

#[cfg(any(debug_assertions, feature = "debug"))]
pub use debug_tools::DebugTools;
pub use lexing::Token;

/// Empirically chosen starting size for the token buffer (used during
/// tokenization) that provides good performance.
const DEFAULT_TOKEN_BUFFER_CAPACITY: usize = 20_000;

/// A slice tagged with its token kind plus its length in UTF-16 code units.
///
/// *No absolute coordinates* are stored - they are recomputed per line.
#[derive(Debug, Clone, Copy)]
struct TokenSpan<'src> {
    lexeme: &'src str,
    token: Token,
    length_utf16: usize,
}

/// Representation of a single physical line of the source file.
///
/// [`Range<usize>`] values are used instead of slices to avoid creating
/// a self-referential struct (with [`TokenizedFile`]), which Rust forbids.
#[derive(Clone)]
enum Line {
    /// A standalone line that owns a contiguous slice in
    /// the [`TokenizedFile::buffer`] arena.
    Standalone(Range<usize>),
    /// A line that only continues a multi-line token; stores the 0-based
    /// number of the line on which that token started.
    Spanned(usize),
    /// A line that continues a multi-line token (0-based starting line)
    /// *and* contains additional tokens local to itself.
    SpannedWithTokens(usize, Range<usize>),
}

/// A tokenized, lossless representation of an UnrealScript source file.
pub struct TokenizedFile<'src> {
    /// Arena of every token span in this file.
    buffer: Vec<TokenSpan<'src>>,
    /// Mapping that provides easy and efficient access to tokens by
    /// line number.
    lines: Vec<Line>,
    /// Simple flag for marking erroneous state.
    had_errors: bool,
}

/// Mutable state that encapsulates data needed during the tokenization loop.
struct Tokenizer<'src> {
    /// Arena that owns every [`TokenSpan`] produced for the file.
    buffer: Vec<TokenSpan<'src>>,
    /// Mapping from physical line number to the tokens that belong to it.
    lines: Vec<Line>,
    /// The current 0-based physical line number.
    line_number: usize,
    /// Index in [`Tokenizer::buffer`] where the current line starts.
    slice_start_index: usize,
    /// When a multi-line token is being scanned, stores the 0-based line
    /// on which it started; [`None`] otherwise.
    multi_line_start: Option<usize>,
    /// Set to [`true`] if the lexer reported any error tokens.
    had_errors: bool,
}

impl<'src> TokenizedFile<'src> {
    /// Tokenize `source` and return a fresh [`TokenizedFile`].
    pub fn from_source(source: &'src str) -> TokenizedFile<'src> {
        let mut tokenizer = TokenizedFile::<'src>::builder();
        let mut lexer = Token::lexer(source);

        // Each item logos yields is either `Ok(token)` or a lexing error;
        // wrap it into a token span and feed it to the tokenizer state.
        while let Some(token_result) = lexer.next() {
            let token = token_result.unwrap_or_else(|_| {
                tokenizer.had_errors = true;
                Token::Error
            });
            let token_span = build_span(token, lexer.slice());
            tokenizer.process_token_span(token_span);
        }
        tokenizer.into_tokenized_file()
    }

    /// Returns [`true`] if any erroneous tokens were produced during building
    /// of this [`TokenizedFile`].
    pub fn had_errors(&self) -> bool {
        self.had_errors
    }

    /// Create an empty tokenizer state with tuned buffer capacity.
    fn builder() -> Tokenizer<'src> {
        Tokenizer {
            buffer: Vec::with_capacity(DEFAULT_TOKEN_BUFFER_CAPACITY),
            lines: Vec::new(),
            line_number: 0,
            slice_start_index: 0,
            multi_line_start: None,
            had_errors: false,
        }
    }
}

impl<'src> Tokenizer<'src> {
    /// Handles a token span and dispatches to the appropriate handler.
    fn process_token_span(&mut self, token_span: TokenSpan<'src>) {
        if token_can_span_lines(&token_span.token) {
            self.process_multi_line_token(token_span);
        } else {
            self.process_single_line_token(token_span);
        }
    }

    /// Handles tokens that never span multiple lines.
    fn process_single_line_token(&mut self, token_span: TokenSpan<'src>) {
        if token_is_newline(&token_span.token) {
            self.line_number += 1;
            self.buffer.push(token_span);
            self.commit_current_line();
        } else {
            self.buffer.push(token_span);
        }
    }

    /// Handles tokens that may contain one or more newline characters.
    fn process_multi_line_token(&mut self, token_span: TokenSpan<'src>) {
        let start_line = self.line_number;
        let newline_count = count_newlines(token_span.lexeme);

        // Did this token end in a newline?
        // This can happen if this is an `Error` token that ends the file.
        let ends_with_newline =
            token_span.lexeme.ends_with('\n') || token_span.lexeme.ends_with('\r');

        self.buffer.push(token_span);
        // We only need to commit the line if this token actually ended the line
        if newline_count > 0 {
            self.commit_current_line();
            // We only need to insert one `Line::Spanned(base)` per *interior*
            // newline, so `newline_count - 1` such lines
            // (e.g. 2 line breaks in a block comment -> it has
            // exactly `1` interior line)
            let insert_count = newline_count - 1;
            for _ in 0..insert_count {
                self.lines.push(Line::Spanned(start_line));
            }
            // This is set *after* `commit_current_line()` cleared the
            // previously stored value
            self.multi_line_start = if ends_with_newline {
                None // we're done at this point
            } else {
                Some(start_line)
            };
        }

        self.line_number = start_line + newline_count;
    }

    /// Commits the tokens of the current physical line into `self.lines`.
    fn commit_current_line(&mut self) {
        let slice_end = self.buffer.len();
        if slice_end > self.slice_start_index {
            let slice = self.slice_start_index..slice_end;

            // If we were in the middle of a multi-line token, we *always*
            // consume `multi_line_start` here, ensuring that each call to
            // `commit_current_line()` only applies it once.
            // This guarantees no "bleed" between adjacent multi-line tokens.
            if let Some(from) = self.multi_line_start.take() {
                self.lines.push(Line::SpannedWithTokens(from, slice));
            } else {
                self.lines.push(Line::Standalone(slice));
            }
            self.slice_start_index = slice_end;
        }
    }

    /// Finishes tokenization, converting accumulated data into
    /// [`TokenizedFile`].
    fn into_tokenized_file(mut self) -> TokenizedFile<'src> {
        // Commit any trailing tokens
        self.commit_current_line();
        // If we still have a `multi_line_start` (i.e. a pure multi-line token
        // with no local tokens on its last line), push a bare `Spanned` entry.
        if let Some(from) = self.multi_line_start.take() {
            self.lines.push(Line::Spanned(from));
        }

        // Optimize for size
        self.buffer.shrink_to_fit();
        self.lines.shrink_to_fit();

        TokenizedFile {
            buffer: self.buffer,
            lines: self.lines,
            had_errors: self.had_errors,
        }
    }
}

fn build_span<'src>(token: Token, text: &'src str) -> TokenSpan<'src> {
    let length_utf16 = text.encode_utf16().count();
    TokenSpan {
        lexeme: text,
        token,
        length_utf16,
    }
}

fn token_is_newline(token: &Token) -> bool {
    matches!(token, Token::NewLine)
}

fn token_can_span_lines(token: &Token) -> bool {
    matches!(
        token,
        Token::BlockComment | Token::Brace(lexing::BraceKind::CppBlock) | Token::Error
    )
}

/// Counts the number of line breaks in the given text.
fn count_newlines(text: &str) -> usize {
    let mut bytes_iterator = text.as_bytes().iter().peekable();
    let mut newline_count = 0;
    while let Some(&next_byte) = bytes_iterator.next() {
        // Logos' regex rule is "\r\n|\n|\r", so we agree with it on new line
        // character treatment
        match next_byte {
            b'\r' => {
                newline_count += 1;
                if let Some(&&b'\n') = bytes_iterator.peek() {
                    // skip the '\n' in a CRLF
                    bytes_iterator.next();
                }
            }
            b'\n' => {
                newline_count += 1;
            }
            _ => (),
        }
    }
    newline_count
}

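A small hypothetical test, not in the commit, that pins down the newline conventions `count_newlines` shares with the logos rule `\r\n|\n|\r`; it could live at the bottom of mod.rs:

#[cfg(test)]
mod tests {
    use super::count_newlines;

    #[test]
    fn crlf_cr_and_lf_each_count_as_one_break() {
        assert_eq!(count_newlines("a\r\nb"), 1); // CRLF is a single break
        assert_eq!(count_newlines("a\rb"), 1); // lone CR
        assert_eq!(count_newlines("a\nb"), 1); // lone LF
        // A block comment with two line breaks has exactly one interior line.
        assert_eq!(count_newlines("/* x\r\ny\r\nz */"), 2);
    }
}
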
rottlib/src/lib.rs (Normal file, 3 lines)

@@ -0,0 +1,3 @@
#![allow(clippy::doc_overindented_list_items)]

pub mod lexer;

rottlsp/Cargo.toml (Normal file, 12 lines)

@@ -0,0 +1,12 @@
[package]
name = "rottlsp"
version = "0.1.0"
edition = "2024"

[dependencies]
rottlib = { version = "0", path = "../rottlib" }
tokio = { version = "1", features = ["full"] }
tower-lsp = "0.20"

[lints]
workspace = true

rottlsp/src/main.rs (Normal file, 84 lines)

@@ -0,0 +1,84 @@
use tower_lsp::lsp_types;

/// A Language Server implementation for Rott.
///
/// Implements the [`tower_lsp::LanguageServer`] trait to handle LSP requests
/// (e.g. initialization, text synchronization, open notifications)
/// asynchronously.
struct RottLanguageServer {
    /// Client handle for sending notifications and requests to the editor.
    client: tower_lsp::Client,
}

#[tower_lsp::async_trait]
impl tower_lsp::LanguageServer for RottLanguageServer {
    // Inform the client of our server capabilities during initialization.
    async fn initialize(
        &self,
        _: lsp_types::InitializeParams,
    ) -> tower_lsp::jsonrpc::Result<lsp_types::InitializeResult> {
        Ok(lsp_types::InitializeResult {
            capabilities: lsp_types::ServerCapabilities {
                // We can synchronize the text of files, which means we request
                // to receive full updates whenever a file is opened or changed.
                // `lsp_types::TextDocumentSyncKind::FULL` means we require the
                // full text every time.
                text_document_sync: Some(lsp_types::TextDocumentSyncCapability::Kind(
                    lsp_types::TextDocumentSyncKind::FULL,
                )),
                ..Default::default()
            },
            ..Default::default()
        })
    }

    // On file open, tokenize the new document and log any lexing errors.
    async fn did_open(&self, params: lsp_types::DidOpenTextDocumentParams) {
        // Measure lexing performance to track parser responsiveness.
        let start_time = std::time::Instant::now();
        let has_errors =
            rottlib::lexer::TokenizedFile::from_source(&params.text_document.text).had_errors();
        let elapsed_time = start_time.elapsed();

        self.client
            .log_message(
                lsp_types::MessageType::INFO,
                format!(
                    "Tokenized {} in {:?}",
                    params.text_document.uri.path(),
                    elapsed_time
                ),
            )
            .await;

        if has_errors {
            self.client
                .log_message(
                    lsp_types::MessageType::INFO,
                    format!(
                        "There was an error while tokenizing {}",
                        params.text_document.uri.path(),
                    ),
                )
                .await;
        }
    }

    // Handle shutdown signal.
    async fn shutdown(&self) -> tower_lsp::jsonrpc::Result<()> {
        // No cleanup required on shutdown; simply acknowledge the request.
        Ok(())
    }
}

#[tokio::main]
async fn main() {
    // We are using standard input and output for communicating with an editor,
    // so we need to avoid methods or macros that write or read using them,
    // e.g. `println!`.
    let (stdin, stdout) = (tokio::io::stdin(), tokio::io::stdout());
    let (service, socket) = tower_lsp::LspService::new(|client| RottLanguageServer { client });
    tower_lsp::Server::new(stdin, stdout, socket)
        .serve(service)
        .await;
}

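Right now lexing errors only reach the client log. One possible follow-up, sketched here and not part of this commit, would be to surface them as LSP diagnostics; the helper name is hypothetical and the sketch assumes only the standard tower-lsp 0.20 client call `publish_diagnostics`:

// Hypothetical addition to `did_open`: publish a single file-level diagnostic
// when the tokenizer reports errors, instead of only logging.
async fn report_lexing_error(client: &tower_lsp::Client, uri: lsp_types::Url) {
    let diagnostic = lsp_types::Diagnostic::new_simple(
        // Until the lexer exposes error positions, point at the start of the file.
        lsp_types::Range::new(
            lsp_types::Position::new(0, 0),
            lsp_types::Position::new(0, 0),
        ),
        "File contains tokens the UnrealScript lexer could not recognize.".to_string(),
    );
    client.publish_diagnostics(uri, vec![diagnostic], None).await;
}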