From 4b9d6a6adbf80573f79ae9b15fe642359c490af8 Mon Sep 17 00:00:00 2001 From: dkanus Date: Wed, 30 Jul 2025 19:46:37 +0700 Subject: [PATCH] Initial commit --- .gitignore | 3 + Cargo.lock | 1104 ++++++++++++++++++++++++++++++ Cargo.toml | 26 + dev_tests/Cargo.toml | 23 + dev_tests/src/dump_tokens.rs | 76 ++ dev_tests/src/uc_lexer_verify.rs | 122 ++++ rottlib/Cargo.toml | 11 + rottlib/src/lexer/debug_tools.rs | 92 +++ rottlib/src/lexer/lexing.rs | 476 +++++++++++++ rottlib/src/lexer/mod.rs | 276 ++++++++ rottlib/src/lib.rs | 3 + rottlsp/Cargo.toml | 12 + rottlsp/src/main.rs | 84 +++ 13 files changed, 2308 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 dev_tests/Cargo.toml create mode 100644 dev_tests/src/dump_tokens.rs create mode 100644 dev_tests/src/uc_lexer_verify.rs create mode 100644 rottlib/Cargo.toml create mode 100644 rottlib/src/lexer/debug_tools.rs create mode 100644 rottlib/src/lexer/lexing.rs create mode 100644 rottlib/src/lexer/mod.rs create mode 100644 rottlib/src/lib.rs create mode 100644 rottlsp/Cargo.toml create mode 100644 rottlsp/src/main.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1c589f7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +/target +flamegraph.svg +perf.data diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..f6bfb73 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,1104 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 4 + +[[package]] +name = "addr2line" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + +[[package]] +name = "async-trait" +version = "0.1.88" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e539d3fca749fcee5236ab05e93a52867dd549cc157c8cb7f99595f3cedffdb5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "auto_impl" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffdcb70bdbc4d478427380519163274ac86e52916e10f0a8889adf0f96d3fee7" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "backtrace" +version = "0.3.75" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6806a6321ec58106fea15becdad98371e28d92ccbc7c8f1b3b6dd724fe8f1002" +dependencies = [ + "addr2line", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", + "windows-targets", +] + +[[package]] +name = "beef" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a8241f3ebb85c056b509d4327ad0358fbbba6ffb340bf388f26350aeda225b1" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" + +[[package]] +name = "bytes" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" + +[[package]] +name = "cfg-if" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9555578bc9e57714c812a1f84e4fc5b4d21fcb063490c624de019f7464c91268" + +[[package]] +name = "chardet" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a48563284b67c003ba0fb7243c87fab68885e1532c605704228a80238512e31" + +[[package]] +name = "dashmap" +version = "5.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856" +dependencies = [ + "cfg-if", + "hashbrown", + "lock_api", + "once_cell", + "parking_lot_core", +] + +[[package]] +name = "dev_tests" +version = "0.1.0" +dependencies = [ + "chardet", + "encoding_rs", + "rottlib", + "walkdir", +] + +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "form_urlencoded" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" +dependencies 
= [ + "percent-encoding", +] + +[[package]] +name = "futures" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" + +[[package]] +name = "futures-io" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" + +[[package]] +name = "futures-macro" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" + +[[package]] +name = "futures-task" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" + +[[package]] +name = "futures-util" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + 
"futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "gimli" +version = "0.31.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" + +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + +[[package]] +name = "httparse" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" + +[[package]] +name = "icu_collections" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47" +dependencies = [ + "displaydoc", + "potential_utf", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_normalizer" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "436880e8e18df4d7bbc06d58432329d6458cc84531f7ac5f024e93deadb37979" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3" + +[[package]] +name = "icu_properties" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"016c619c1eeb94efb86809b015c58f479963de65bdb6253345c1a1276f22e32b" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_locale_core", + "icu_properties_data", + "icu_provider", + "potential_utf", + "zerotrie", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "298459143998310acd25ffe6810ed544932242d3f07083eee1084d83a71bd632" + +[[package]] +name = "icu_provider" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af" +dependencies = [ + "displaydoc", + "icu_locale_core", + "stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + +[[package]] +name = "idna" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + +[[package]] +name = "io-uring" +version = "0.7.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d93587f37623a1a17d94ef2bc9ada592f5465fe7732084ab7beefabe5c77c0c4" +dependencies = [ + "bitflags 2.9.1", + "cfg-if", + "libc", +] + +[[package]] +name = "itoa" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + 
+[[package]] +name = "libc" +version = "0.2.174" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776" + +[[package]] +name = "litemap" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" + +[[package]] +name = "lock_api" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96936507f153605bddfcda068dd804796c84324ed2510809e5b2a624c81da765" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "logos" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab6f536c1af4c7cc81edf73da1f8029896e7e1e16a219ef09b184e76a296f3db" +dependencies = [ + "logos-derive", +] + +[[package]] +name = "logos-codegen" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "189bbfd0b61330abea797e5e9276408f2edbe4f822d7ad08685d67419aafb34e" +dependencies = [ + "beef", + "fnv", + "lazy_static", + "proc-macro2", + "quote", + "regex-syntax", + "rustc_version", + "syn", +] + +[[package]] +name = "logos-derive" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebfe8e1a19049ddbfccbd14ac834b215e11b85b90bab0c2dba7c7b92fb5d5cba" +dependencies = [ + "logos-codegen", +] + +[[package]] +name = "lsp-types" +version = "0.94.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c66bfd44a06ae10647fe3f8214762e9369fd4248df1350924b4ef9e770a85ea1" +dependencies = [ + "bitflags 1.3.2", + "serde", + "serde_json", + "serde_repr", + "url", +] + +[[package]] +name = "memchr" +version = "2.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" + +[[package]] +name = "miniz_oxide" +version = "0.8.9" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +dependencies = [ + "adler2", +] + +[[package]] +name = "mio" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c" +dependencies = [ + "libc", + "wasi", + "windows-sys", +] + +[[package]] +name = "object" +version = "0.36.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87" +dependencies = [ + "memchr", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "parking_lot" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70d58bf43669b5795d1576d0641cfb6fbb2057bf629506267a92807158584a13" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc838d2a56b5b1a6c25f55575dfc605fabb63bb2365f6c2353ef9159aa69e4a5" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", +] + +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + +[[package]] +name = "pin-project" +version = "1.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.10" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "potential_utf" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5a7c30837279ca13e7c867e9e40053bc68740f988cb07f7ca6df43cc734b585" +dependencies = [ + "zerovec", +] + +[[package]] +name = "proc-macro2" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "redox_syscall" +version = "0.5.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5407465600fb0548f1442edf71dd20683c6ed326200ace4b1ef0763521bb3b77" +dependencies = [ + "bitflags 2.9.1", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + +[[package]] +name = "rottlib" +version = "0.1.0" +dependencies = [ + "logos", +] + +[[package]] +name = "rottlsp" +version = "0.1.0" +dependencies = [ + "rottlib", + "tokio", + "tower-lsp", +] + +[[package]] +name = "rustc-demangle" 
+version = "0.1.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" + +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + +[[package]] +name = "ryu" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "semver" +version = "1.0.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0" + +[[package]] +name = "serde" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.141" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30b9eff21ebe718216c6ec64e1d9ac57087aad11efc64e32002bce4a0d4c03d3" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + 
+[[package]] +name = "serde_repr" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9203b8055f63a2a00e2f593bb0510367fe707d7ff1e5c872de2f537b339e5410" +dependencies = [ + "libc", +] + +[[package]] +name = "slab" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04dc19736151f35336d325007ac991178d504a119863a2fcb3758cdb5e52c50d" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "socket2" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "233504af464074f9d066d7b5416c5f9b894a5862a6506e306f7b816cdd6f1807" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + +[[package]] +name = "syn" +version = "2.0.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17b6f705963418cdb9927482fa304bc562ece2fdd4f616084c50b7023b435a40" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "synstructure" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tinystr" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b" +dependencies = [ + "displaydoc", + "zerovec", +] + +[[package]] +name = "tokio" +version = "1.47.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43864ed400b6043a4757a25c7a64a8efde741aed79a056a2fb348a406701bb35" +dependencies = [ + "backtrace", + "bytes", + "io-uring", + "libc", + "mio", + "parking_lot", + "pin-project-lite", + "signal-hook-registry", + "slab", + "socket2", + "tokio-macros", + "windows-sys", +] + +[[package]] +name = "tokio-macros" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio-util" +version = "0.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66a539a9ad6d5d281510d5bd368c973d636c02dbf8a67300bfb6b950696ad7df" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tower" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" +dependencies = [ + "futures-core", + "futures-util", + "pin-project", + "pin-project-lite", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + +[[package]] +name = "tower-lsp" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4ba052b54a6627628d9b3c34c176e7eda8359b7da9acd497b9f20998d118508" +dependencies = [ + "async-trait", + "auto_impl", + "bytes", + "dashmap", + "futures", + "httparse", + "lsp-types", + "memchr", + "serde", + "serde_json", + "tokio", + "tokio-util", + "tower", 
+ "tower-lsp-macros", + "tracing", +] + +[[package]] +name = "tower-lsp-macros" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84fd902d4e0b9a4b27f2f440108dc034e1758628a9b702f8ec61ad66355422fa" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tower-service" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" + +[[package]] +name = "tracing" +version = "0.1.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81383ab64e72a7a8b8e13130c49e3dab29def6d0c7d76a03087b3cf71c5c6903" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing-core" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9d12581f227e93f094d3af2ae690a574abb8a2b9b7a96e7cfe9647b2b617678" +dependencies = [ + "once_cell", +] + +[[package]] +name = "unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + +[[package]] +name = "url" +version = "2.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", + "serde", +] + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + +[[package]] +name = 
"walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "winapi-util" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + 
+[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "writeable" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" + +[[package]] +name = "yoke" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zerofrom" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zerotrie" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", +] + +[[package]] +name = "zerovec" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a05eb080e015ba39cc9e23bbe5e7fb04d5fb040350f99f34e338d5fdd294428" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..4e9485c --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,26 @@ +[workspace] +resolver = "2" +members = ["dev_tests", "rottlsp", "rottlib"] + +[workspace.package] +edition = "2024" + +[workspace.lints.clippy] +all = "warn" +nursery = "warn" +pedantic = "warn" + +[profile.release] +opt-level = 3 # Optimize for speed +strip = true # Strip symbols from binary +lto = true # Enable link-time optimization +panic = "abort" # Abort on panic +overflow-checks = false # no integer checks +codegen-units = 1 # Reduce number of codegen units to increase optimizations +debug = false # strip all debug info + +[profile.flamegraph] +inherits = "release" # start from release +strip = false +debug = true # full DWARF info for unwinding +split-debuginfo = "unpacked" # 
keep symbols inside the binary diff --git a/dev_tests/Cargo.toml b/dev_tests/Cargo.toml new file mode 100644 index 0000000..7c528e3 --- /dev/null +++ b/dev_tests/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "dev_tests" +version = "0.1.0" +edition = "2024" + +[[bin]] +name = "dump_tokens" +path = "src/dump_tokens.rs" + +[[bin]] +name = "uc_lexer_verify" +path = "src/uc_lexer_verify.rs" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +rottlib = { version = "0", path = "../rottlib", features = ["debug"] } +walkdir="2.5" +encoding_rs="0.8" +chardet="0.2" + +[lints] +workspace = true \ No newline at end of file diff --git a/dev_tests/src/dump_tokens.rs b/dev_tests/src/dump_tokens.rs new file mode 100644 index 0000000..5f25c0a --- /dev/null +++ b/dev_tests/src/dump_tokens.rs @@ -0,0 +1,76 @@ +use std::{ + fs, + path::{Path, PathBuf}, +}; + +use encoding_rs::{Encoding, UTF_8}; +use rottlib::lexer::{DebugTools, TokenizedFile}; + +/// Recursively search `root` for the first file whose *basename* matches +/// `needle` (case-sensitive). +/// +/// Returns the absolute path. +fn find_file(root: &Path, needle: &str) -> Option { + for entry in walkdir::WalkDir::new(root) + .into_iter() + .filter_map(Result::ok) + { + let path = entry.path(); + if path.is_file() && (path.file_name().and_then(|name| name.to_str()) == Some(needle)) { + return fs::canonicalize(path).ok(); + } + } + None +} + +/// CLI: `dump_tokens ` - searches for `` +/// recursively inside ``. +/// +/// This utility takes *root directory* and *file name* instead of the full path +/// to help us avoid searching for them typing names out: +/// +/// - We know where all the sources are; +/// - We usually just know the name of the file that is being problematic. 
+fn main() { + let mut args = std::env::args().skip(1); + let root_dir = args.next().unwrap_or_else(|| { + eprintln!("Usage: inspect_uc "); + std::process::exit(1); + }); + let file_name = args.next().unwrap_or_else(|| { + eprintln!("Usage: inspect_uc "); + std::process::exit(1); + }); + + let root = PathBuf::from(&root_dir); + if !root.exists() { + eprintln!("Root directory '{root_dir}' does not exist."); + std::process::exit(1); + } + + let found_path = find_file(&root, &file_name).map_or_else( + || { + eprintln!("File '{file_name}' not found under '{root_dir}'."); + std::process::exit(1); + }, + |path| path, + ); + + // Read & decode + let raw_bytes = match fs::read(&found_path) { + Ok(sources) => sources, + Err(error) => { + eprintln!("Could not read {}: {error}", found_path.display()); + std::process::exit(1); + } + }; + + let (encoding_label, _, _) = chardet::detect(&raw_bytes); + let encoding = Encoding::for_label(encoding_label.as_bytes()).unwrap_or(UTF_8); + let (decoded_str, _, _) = encoding.decode(&raw_bytes); + + let source_text = decoded_str.to_string(); + let tokenized_file = TokenizedFile::from_source(&source_text); + + tokenized_file.dump_debug_layout(); +} diff --git a/dev_tests/src/uc_lexer_verify.rs b/dev_tests/src/uc_lexer_verify.rs new file mode 100644 index 0000000..b34e8a2 --- /dev/null +++ b/dev_tests/src/uc_lexer_verify.rs @@ -0,0 +1,122 @@ +use std::{collections::HashSet, fs, path::PathBuf}; + +use rottlib::lexer::{DebugTools, TokenizedFile}; + +/// Read `ignore.txt` (one path per line, `#` for comments) from root directory +/// and turn it into a canonicalized [`HashSet`]. 
/// Parse `ignore.txt` in `root` (one path per line, `#`-prefixed lines are
/// comments) into a set of canonicalized absolute paths.
///
/// A missing or unreadable `ignore.txt` simply yields an empty set -
/// "ignore nothing" is an acceptable fallback for a debug tool.
fn load_ignore_set(root: &std::path::Path) -> HashSet<PathBuf> {
    let ignore_file = root.join("ignore.txt");
    if !ignore_file.exists() {
        return HashSet::new();
    }

    let content = match fs::read_to_string(&ignore_file) {
        Ok(text) => text,
        Err(error) => {
            eprintln!("Could not read {}: {error}", ignore_file.display());
            return HashSet::new();
        }
    };

    let mut ignored = HashSet::new();
    for entry in content.lines().map(str::trim) {
        if entry.is_empty() || entry.starts_with('#') {
            continue;
        }
        let candidate = PathBuf::from(entry);
        let absolute = if candidate.is_absolute() {
            candidate
        } else {
            root.join(candidate)
        };
        // Canonicalization also silently drops entries that point at nothing,
        // matching the original `filter_map(... .ok())` behavior.
        if let Ok(resolved) = fs::canonicalize(absolute) {
            ignored.insert(resolved);
        }
    }
    ignored
}
+fn main() { + let root_dir = std::env::args().nth(1).unwrap(); // it is fine to crash debug utility + let root = PathBuf::from(&root_dir); + + if !root.exists() { + eprintln!("Root directory '{root_dir}' does not exist."); + std::process::exit(1); + } + + // Load files + let ignored_paths = load_ignore_set(&root); + let mut uc_files: Vec<(PathBuf, String)> = Vec::new(); + for entry in walkdir::WalkDir::new(&root) + .into_iter() + .filter_map(Result::ok) // for debug tool this is ok + .filter(|entry| { + let path = entry.path(); + // Skip anything explicitly ignored + if let Ok(absolute_path) = fs::canonicalize(path) { + if ignored_paths.contains(&absolute_path) { + return false; + } + } + // Must be *.uc + path.is_file() + && path + .extension() + .and_then(|extension| extension.to_str()) + .is_some_and(|extension| extension.eq_ignore_ascii_case("uc")) + }) + { + let path = entry.path(); + match fs::read(path) { + Ok(raw_bytes) => { + // Auto‑detect encoding for old Unreal script sources + let (encoding_label, _, _) = chardet::detect(&raw_bytes); + let encoding = encoding_rs::Encoding::for_label(encoding_label.as_bytes()) + .unwrap_or(encoding_rs::UTF_8); + let (decoded_text, _, _) = encoding.decode(&raw_bytes); + uc_files.push((path.to_path_buf(), decoded_text.into_owned())); + } + Err(error) => { + eprintln!("Failed to read `{}`: {error}", path.display()); + std::process::exit(1); + } + } + } + println!("Loaded {} .uc files into memory.", uc_files.len()); + + // Tokenize and measure performance + let start_time = std::time::Instant::now(); + let tokenized_files: Vec<(PathBuf, TokenizedFile)> = uc_files + .iter() + .map(|(path, source_code)| { + let tokenized_file = TokenizedFile::from_source(source_code); + if tokenized_file.had_errors() { + println!("TK: {}", path.display()); + } + (path.clone(), tokenized_file) + }) + .collect(); + let elapsed_time = start_time.elapsed(); + println!( + "Tokenized {} files in {:.2?}", + tokenized_files.len(), + elapsed_time + 
); + + // Round‑trip check + for ((path, original), (_, tokenized_file)) in uc_files.iter().zip(tokenized_files.iter()) { + let reconstructed = tokenized_file.reconstruct_source(); + if original != &reconstructed { + eprintln!("Reconstruction mismatch in `{}`!", path.display()); + std::process::exit(1); + } + } + + println!("All .uc files matched successfully."); +} diff --git a/rottlib/Cargo.toml b/rottlib/Cargo.toml new file mode 100644 index 0000000..1d879c8 --- /dev/null +++ b/rottlib/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "rottlib" +version = "0.1.0" +edition = "2024" + +[features] +default = [] +debug = [] + +[dependencies] +logos = "0.15" \ No newline at end of file diff --git a/rottlib/src/lexer/debug_tools.rs b/rottlib/src/lexer/debug_tools.rs new file mode 100644 index 0000000..d8c84ec --- /dev/null +++ b/rottlib/src/lexer/debug_tools.rs @@ -0,0 +1,92 @@ +//! Debug-only helpers for [`TokenizedFile`] +//! +//! This module is **compiled only if** +//! +//! * the current build profile has `debug_assertions` enabled, or +//! * the crate is built with the `debug` cargo feature. +//! +//! These checks have been moved to the parent module. + +use super::Line; + +/// A technical trait that adds debug helpers to the lexer. +pub trait DebugTools { + /// Pretty-prints the internal layout of the tokenised file - useful when + /// writing new passes or hunting lexer bugs. + /// + /// This method writes the layout directly to standard output. + /// + /// The format is unspecified, may change, and is not intended for + /// external tools. + /// + /// Each line in the printed layout starts with its 0-based number for + /// convenience. + fn dump_debug_layout(&self); + + /// Reconstructs the exact, lossless source text that was fed to + /// [`super::TokenizedFile::from_source`] from internal representation - + /// useful for manually verifying that the lexer works. 
+ fn reconstruct_source(&self) -> String; +} + +impl<'src> DebugTools for super::TokenizedFile<'src> { + fn reconstruct_source(&self) -> String { + let mut result = String::new(); + for line in &self.lines { + if let Line::Standalone(token_range) | Line::SpannedWithTokens(_, token_range) = line { + for span in &self.buffer[token_range.clone()] { + result.push_str(span.lexeme); + } + } + } + result + } + + fn dump_debug_layout(&self) { + for (row_index, line) in self.lines.iter().enumerate() { + println!("Line {}", row_index + 1); + match line { + Line::Standalone(token_range) => { + println!("\t[Standalone]"); + let mut column_utf16 = 0usize; + for next_token_span in &self.buffer[token_range.clone()] { + let token_beginning = column_utf16; + let token_end = column_utf16 + next_token_span.length_utf16; + println!( + "\t\t{:?} @ {}-{}: {:?}", + next_token_span.token, + token_beginning, + token_end, + next_token_span.lexeme + ); + column_utf16 = token_end; + } + } + Line::Spanned(origin_row) => { + // `origin_row` is 0-based + println!( + "\t[Continued from line {} - no new tokens here]", + origin_row + 1 + ); + } + Line::SpannedWithTokens(origin_row, token_range) => { + // `origin_row` is 0-based + println!("\t[Continued from line {} + new tokens]", origin_row + 1); + let mut column_utf16 = 0usize; + for next_token_span in &self.buffer[token_range.clone()] { + let token_beginning = column_utf16; + let token_end = column_utf16 + next_token_span.length_utf16; + println!( + "\t\t{:?} @ {}-{}: {:?}", + next_token_span.token, + token_beginning, + token_end, + next_token_span.lexeme + ); + column_utf16 = token_end; + } + } + } + } + } +} diff --git a/rottlib/src/lexer/lexing.rs b/rottlib/src/lexer/lexing.rs new file mode 100644 index 0000000..a55b5d9 --- /dev/null +++ b/rottlib/src/lexer/lexing.rs @@ -0,0 +1,476 @@ +//! Lexer for UnrealScript that understands inline `cpptext { ... }` blocks. +//! +//! ## Notable details +//! +//! 
Lexer for UnrealScript that recognises inline `cpptext { … }` blocks. +//! +//! In UnrealScript, `cpptext` lets authors embed raw C++ between braces. +//! Because whitespace, newlines, or comments may appear between the +//! `cpptext` keyword and the opening `{`, the lexer must remember that +//! it has just seen `cpptext` - hence a state machine. +//! +//! Modes +//! ------ +//! - **Normal** - ordinary UnrealScript tokens. +//! - **AwaitingCppBlock** - after `cpptext`, waiting for the next `{`. +//! +//! When that brace arrives, the lexer consumes the entire C++ block as +//! one token (`Token::Brace(BraceKind::CppBlock)`), tracking nested +//! braces, strings, and comments on the way. If the closing `}` is +//! missing, everything to EOF is treated as C++; downstream parsers must +//! handle that gracefully. + +use logos::Lexer; + +/// Which lexer mode we're in. See the module docs for the full story. +#[derive(Default, Clone, Copy, PartialEq, Eq)] +enum LexerMode { + /// Lexing regular UnrealScript. + #[default] + Normal, + /// Saw `cpptext`; waiting for the opening `{` of a C++ block. + AwaitingCppBlock, +} + +/// Extra per-lexer state. Currently just holds the [`Mode`]. +/// +/// This is a logos-specific implementation detail. +#[derive(Default)] +pub struct LexerState { + mode: LexerMode, +} + +/// Are these braces "real" UnrealScript braces, or the start/end of a C++ block? +#[derive(Debug, PartialEq, Clone, Copy)] +pub enum BraceKind { + Normal, + CppBlock, +} + +/// All UnrealScript tokens that our compiler distinguishes. 
+#[derive(logos::Logos, Debug, PartialEq, Clone, Copy)] +#[logos(extras = LexerState)] +pub enum Token { + // # Compiler/directive keywords + #[regex(r"(?i)#exec[^\r\n]*(\r|\n|\r\n)")] + ExecDirective, + #[regex("(?i)cpptext", |lex| { lex.extras.mode = LexerMode::AwaitingCppBlock; })] + CppText, + + // # Declaration & structural keywords + #[regex("(?i)class")] + Class, + #[regex("(?i)struct")] + Struct, + #[regex("(?i)enum")] + Enum, + #[regex("(?i)state")] + State, + #[regex("(?i)function")] + Function, + #[regex("(?i)event")] + Event, + #[regex("(?i)delegate")] + Delegate, + #[regex("(?i)var")] + Var, + #[regex("(?i)local")] + Local, + + // # Inheritance, interface, dependencies + #[regex("(?i)extends")] + Extends, + #[regex("(?i)dependson")] + DependsOn, + + // # Access modifiers & properties + #[regex("(?i)private")] + Private, + #[regex("(?i)protected")] + Protected, + #[regex("(?i)public")] + Public, + #[regex("(?i)const")] + Const, + #[regex("(?i)static")] + Static, + #[regex("(?i)native")] + Native, + #[regex("(?i)abstract")] + Abstract, + #[regex("(?i)deprecated")] + Deprecated, + + // # UnrealScript metadata/specifiers + #[regex("(?i)default")] + Default, + #[regex("(?i)defaultproperties")] + DefaultProperties, + #[regex("(?i)optional")] + Optional, + #[regex("(?i)config")] + Config, + #[regex("(?i)perobjectconfig")] + PerObjectConfig, + #[regex("(?i)globalconfig")] + GlobalConfig, + #[regex("(?i)collapsecategories")] + CollapseCategories, + #[regex("(?i)dontcollapsecategories")] + DontCollapseCategories, + #[regex("(?i)hidecategories")] + HideCategories, + #[regex("(?i)localized")] + Localized, + #[regex("(?i)placeable")] + Placeable, + #[regex("(?i)notplaceable")] + NotPlaceable, + #[regex("(?i)editinlinenew")] + EditInlineNew, + #[regex("(?i)noteditinlinenew")] + NotEditInlineNew, + #[regex("(?i)dynamicrecompile")] + DynamicRecompile, + #[regex("(?i)transient")] + Transient, + #[regex("(?i)operator")] + Operator, + #[regex("(?i)simulated")] + 
Simulated, + #[regex("(?i)latent")] + Latent, + #[regex("(?i)iterator")] + Iterator, + #[regex("(?i)out")] + Out, + #[regex("(?i)skip")] + Skip, + #[regex("(?i)singular")] + Singular, + #[regex("(?i)coerce")] + Coerce, + #[regex("(?i)assert")] + Assert, + #[regex("(?i)ignores")] + Ignores, + #[regex("(?i)within")] + Within, + #[regex("(?i)noexport")] + NoExport, + + // # Replication-related + #[regex("(?i)reliable")] + Reliable, + #[regex("(?i)unreliable")] + Unreliable, + #[regex("(?i)replication")] + Replication, + #[regex("(?i)nativereplication")] + NativeReplication, + + // # Control-flow keywords + #[regex("(?i)if")] + If, + #[regex("(?i)else")] + Else, + #[regex("(?i)switch")] + Switch, + #[regex("(?i)case")] + Case, + #[regex("(?i)for")] + For, + #[regex("(?i)foreach")] + ForEach, + #[regex("(?i)while")] + While, + #[regex("(?i)do")] + Do, + #[regex("(?i)until")] + Until, + #[regex("(?i)break")] + Break, + #[regex("(?i)continue")] + Continue, + #[regex("(?i)return")] + Return, + + // # Built-in types + #[regex("(?i)int")] + Int, + #[regex("(?i)float")] + Float, + #[regex("(?i)bool")] + Bool, + #[regex("(?i)byte")] + Byte, + #[regex("(?i)string")] + String, + #[regex("(?i)array")] + Array, + #[regex("(?i)name")] + Name, + + // # Literals & identifiers + #[regex(r"0[xX][0-9A-Fa-f]+|[0-9]+")] + IntegerLiteral, + #[regex(r"[0-9]*\.[0-9]+([eE][+-]?[0-9]+)?")] + FloatLiteral, + #[regex(r#""([^"\\\r\n]|\\.)*""#)] + StringLiteral, + #[regex(r"'[a-zA-Z0-9_\. 
\-]*'")] + NameLiteral, + #[regex("(?i)true")] + True, + #[regex("(?i)false")] + False, + #[regex("(?i)none")] + None, + #[regex("(?i)self")] + SelfKeyword, + #[regex("(?i)new")] + New, + #[regex(r"[a-zA-Z_][a-zA-Z0-9_]*")] + Identifier, + + // # Operations + // ## Exponentiation + #[token("**")] + Exponentiation, + // ## Unary + #[token("++")] + Increment, + #[token("--")] + Decrement, + #[token("!")] + Not, + #[token("~")] + BitwiseNot, + // ## Vector + #[token("dot")] + Dot, + #[token("cross")] + Cross, + // ## Multiplicative + #[token("*")] + Multiply, + #[token("/")] + Divide, + #[token("%")] + Modulo, + // ## Additive + #[token("+")] + Plus, + #[token("-")] + Minus, + // ## String manipulation + #[token("@")] + AtChar, + #[token("$")] + DollarChar, + // ## Shifts + #[token("<<")] + LeftShift, + #[token(">>>")] + LogicalRightShift, + #[token(">>")] + RightShift, + // ## Relational + #[token("<")] + Less, + #[token("<=")] + LessEqual, + #[token(">")] + Greater, + #[token(">=")] + GreaterEqual, + #[token("==")] + Equal, + #[token("!=")] + NotEqual, + #[token("~=")] + ApproximatelyEqual, + // ## Bitwise + #[token("&")] + BitwiseAnd, + #[token("|")] + BitwiseOr, + #[token("^")] + BitwiseXor, + #[token("^^")] + BooleanXor, + // ## Logical + #[token("&&")] + And, + #[token("||")] + Or, + // ## Assigments + #[token("=")] + Assign, + #[token("*=")] + MultiplyAssign, + #[token("/=")] + DivideAssign, + #[token("+=")] + PlusAssign, + #[token("-=")] + MinusAssign, + #[token("$=")] + ConcatAssign, + #[token("@=")] + ConcatSpaceAssign, + + // # Punctuation & delimiters + #[token("(")] + LeftParen, + #[token(")")] + RightParen, + #[token("{", handle_brace)] + Brace(BraceKind), + #[token("}")] + RightBrace, + #[token("[")] + LeftBracket, + #[token("]")] + RightBracket, + #[token(";")] + Semicolon, + #[token(",")] + Comma, + #[token(".")] + Period, + #[token(":")] + Colon, + + // # Comments & whitespaces + #[regex(r"//[^\r\n]*")] + LineComment, + #[regex(r"/\*", 
handle_block_comment)] + BlockComment, + #[regex(r"\r\n|\n|\r")] + NewLine, + #[regex(r"[ \t]+")] + Whitespace, + + // # Technical + Error, +} + +/// Consume a /* ... */ block comment with arbitrary nesting +/// (like UnrealScript allows). +/// +/// Matches the whole comment (delimiters included) or [`None`] if the file ends +/// before every `/*` is closed. +fn handle_block_comment(lexer: &mut Lexer) -> Option<()> { + let mut comment_depth = 1; + while let Some(next_char) = lexer.remainder().chars().next() { + if lexer.remainder().starts_with("/*") { + comment_depth += 1; + lexer.bump(2); + continue; + } + if lexer.remainder().starts_with("*/") { + comment_depth -= 1; + lexer.bump(2); + if comment_depth == 0 { + return Some(()); + } + continue; + } + lexer.bump(next_char.len_utf8()); + } + // Unterminated comment + None +} + +/// Called for every `{`. +/// +/// This method either emits an opening brace or token for `cppblock`, +/// depending on lexer's current state. +fn handle_brace(lexer: &mut Lexer) -> Option { + match lexer.extras.mode { + LexerMode::Normal => Some(BraceKind::Normal), + + LexerMode::AwaitingCppBlock => { + lexer.extras.mode = LexerMode::Normal; + consume_cpp_block(lexer); + Some(BraceKind::CppBlock) + } + } +} + +/// Consumes a complete C++ block, handling: +/// - Nested `{...}` pairs +/// - String literals (`"..."` and `'...'`), including escaped quotes +/// - Line comments (`// ...\n`) +/// - Block comments (`/* ... */`) +/// +/// Leaves the lexer positioned immediately after the closing `}` of the block. +/// The opening `{` must have already been consumed by the caller. 
+fn consume_cpp_block(lexer: &mut Lexer) { + let mut depth = 1; + while let Some(ch) = lexer.remainder().chars().next() { + match ch { + '{' => { + depth += 1; + lexer.bump(1); + } + '}' => { + depth -= 1; + lexer.bump(1); + if depth == 0 { + break; + } + } + '/' if lexer.remainder().starts_with("/*") => { + lexer.bump(2); // consuming two-byte sequence `/*` + consume_c_comment(lexer) + } + '/' if lexer.remainder().starts_with("//") => { + lexer.bump(2); // consuming two-byte sequence `//` + while let Some(c) = lexer.remainder().chars().next() { + lexer.bump(c.len_utf8()); + if c == '\n' { + break; + } + } + } + '"' | '\'' => { + lexer.bump(1); // skip `'` or `"` + consume_string_literal(lexer, ch); + } + _ => lexer.bump(ch.len_utf8()), + } + } +} + +/// Consume over a C-style `/* … */` comment (without nesting). +/// +/// Assumes that opener `/*` is already consumed. +fn consume_c_comment(lexer: &mut Lexer) { + while let Some(next_character) = lexer.remainder().chars().next() { + if lexer.remainder().starts_with("*/") { + lexer.bump(2); + break; + } else { + lexer.bump(next_character.len_utf8()); + } + } +} + +/// Consume a string literal from C++ code. +/// +/// Assumes that opening quotation mark is already consumed. +fn consume_string_literal(lexer: &mut Lexer, delimiter: char) { + while let Some(next_character) = lexer.remainder().chars().next() { + lexer.bump(next_character.len_utf8()); + if next_character == '\\' { + // Skip the escaped character + if let Some(next) = lexer.remainder().chars().next() { + lexer.bump(next.len_utf8()); + } + } else if next_character == delimiter { + return; + } + } +} diff --git a/rottlib/src/lexer/mod.rs b/rottlib/src/lexer/mod.rs new file mode 100644 index 0000000..8fd40ef --- /dev/null +++ b/rottlib/src/lexer/mod.rs @@ -0,0 +1,276 @@ +//! # Tokenizer +//! +//! Converts raw source text into a lossless, position-aware stream of lexical +//! [`Token`]s, grouped *per physical line*, and returns it as +//! a [`TokenizedFile`]. 
+//! +//! Design goals: +//! +//! 1. **Lossless**: preserving complete information for each token, enough to +//! recreate the original bytes without loss. +//! 2. **LSP readiness**: the LSP heavily relies on UTF-16 support, so we +//! precompute lengths of each token in that encoding, making interfacing +//! easier. +//! +//! ## Opt-in debug helpers +//! +//! Extra diagnostics become available in **debug builds** or when the crate is +//! compiled with `debug` feature enabled. They live in the [`DebugTools`] +//! extension trait, implemented for [`TokenizedFile`]. +//! +//! ``` +//! // bring the trait into scope +//! use lexer::DebugTools; +//! +//! let file = TokenizedFile::from_source(src); +//! file.debug_dump(); // pretty-print token layout +//! let text = file.to_source(); // reconstruct original text +//! ``` + +mod debug_tools; +mod lexing; + +use std::ops::Range; + +use logos::Logos; + +#[cfg(any(debug_assertions, feature = "debug"))] +pub use debug_tools::DebugTools; +pub use lexing::Token; + +/// Empirically chosen starting size for token buffer (used during tokenization) +/// that provides good performance. +const DEFAULT_TOKEN_BUFFER_CAPACITY: usize = 20_000; + +/// A slice tagged with its token kind plus two length counters. +/// +/// *No absolute coordinates* are stored - they are recomputed per line. +#[derive(Debug, Clone, Copy)] +struct TokenSpan<'src> { + lexeme: &'src str, + token: Token, + length_utf16: usize, +} + +/// Representation of a single physical line of the source file. +/// +/// [`Range`] are used instead of slices to avoid creating +/// a self-referential struct (with [`TokenizedFile`]), which rust forbids. +#[derive(Clone)] +enum Line { + /// A standalone line that owns a contiguous slice in + /// the [`TokenizedFile::buffer`] arena. + Standalone(Range), + /// A 0-based line that is part of a multi-line token started on + /// another line. 
+ Spanned(usize), + /// A 0-based line that is part of a multi-line token started on + /// another line *and* contains additional tokens local to itself. + SpannedWithTokens(usize, Range), +} + +/// A tokenized, lossless representation of an UnrealScript source file. +pub struct TokenizedFile<'src> { + /// Arena of every token span in this file. + buffer: Vec>, + /// Mapping that provides an easy and efficient access to tokens by + /// line number. + lines: Vec, + /// Simple flag for marking erroneous state. + had_errors: bool, +} + +/// Mutable state that encapsulates data needed during the tokenization loop. +struct Tokenizer<'src> { + /// Arena that owns every [`TokenSpan`] produced for the file. + buffer: Vec>, + /// Mapping from physical line number to the tokens that belong to it. + lines: Vec, + /// The current 0-based physical line number. + line_number: usize, + /// Index in [`Tokenizer::buffer`] where the current line starts. + slice_start_index: usize, + /// When a multi-line token is being scanned, stores the 0-based line + /// on which it started; [`None`] otherwise. + multi_line_start: Option, + /// Set to [`true`] if the lexer reported any error tokens. + had_errors: bool, +} + +impl<'src> TokenizedFile<'src> { + /// Tokenize `source` and return a fresh [`TokenizedFile`]. + pub fn from_source(source: &'src str) -> TokenizedFile<'src> { + let mut tokenizer = TokenizedFile::<'src>::builder(); + let mut lexer = Token::lexer(source); + + // Logos > Ok() > token > token span <- plugged into tokenizer + while let Some(token_result) = lexer.next() { + let token = token_result.unwrap_or_else(|_| { + tokenizer.had_errors = true; + Token::Error + }); + let token_span = build_span(token, lexer.slice()); + tokenizer.process_token_span(token_span); + } + tokenizer.into_tokenized_file() + } + + /// Returns [`true`] if any erroneous tokens were produced during building + /// of this [`TokenizedFile`]. 
+ pub fn had_errors(&self) -> bool { + self.had_errors + } + + /// Create an empty tokenizer state with tuned buffer capacity. + fn builder() -> Tokenizer<'src> { + Tokenizer { + buffer: Vec::with_capacity(DEFAULT_TOKEN_BUFFER_CAPACITY), + lines: Vec::new(), + line_number: 0, + slice_start_index: 0, + multi_line_start: None, + had_errors: false, + } + } +} + +impl<'src> Tokenizer<'src> { + /// Handles a token span and dispatches to the appropriate handler. + fn process_token_span(&mut self, token_span: TokenSpan<'src>) { + if token_can_span_lines(&token_span.token) { + self.process_multi_line_token(token_span); + } else { + self.process_single_line_token(token_span); + } + } + + /// Handles tokens that never span multiple lines. + fn process_single_line_token(&mut self, token_span: TokenSpan<'src>) { + if token_is_newline(&token_span.token) { + self.line_number += 1; + self.buffer.push(token_span); + self.commit_current_line(); + } else { + self.buffer.push(token_span); + } + } + + /// Handles tokens that may contain one or more newline characters. + fn process_multi_line_token(&mut self, token_span: TokenSpan<'src>) { + let start_line = self.line_number; + let newline_count = count_newlines(token_span.lexeme); + + // Did this token end in a newline? + // This can happen if this is an `Error` token that ends the file. + let ends_with_newline = + token_span.lexeme.ends_with('\n') || token_span.lexeme.ends_with('\r'); + + self.buffer.push(token_span); + // We only need to commit the line if this token actually ended the line + if newline_count > 0 { + self.commit_current_line(); + // We only need to insert one `Line::Spanned(base)` per *interior* + // newline, so `newline_count - 1` such lines + // (e.g. 
2 line breaks in block comment -> it has + // exactly `1` interior line) + let insert_count = newline_count - 1; + for _ in 0..insert_count { + self.lines.push(Line::Spanned(start_line)); + } + // This is called *after* `commit_current_line()` cleared previous + // stored value + self.multi_line_start = if ends_with_newline { + None // we're done at this point + } else { + Some(start_line) + }; + } + + self.line_number = start_line + newline_count; + } + + /// Commits the tokens of the current physical line into `self.lines`. + fn commit_current_line(&mut self) { + let slice_end = self.buffer.len(); + if slice_end > self.slice_start_index { + let slice = self.slice_start_index..slice_end; + + // If we were in the middle of a multi-line token, we + // *always* consume `spanned_from` here, ensuring that each call to + // `commit_current_line()` only applies it once. + // This guarantees no "bleed" between adjacent multi-line tokens. + if let Some(from) = self.multi_line_start.take() { + self.lines.push(Line::SpannedWithTokens(from, slice)); + } else { + self.lines.push(Line::Standalone(slice)); + } + self.slice_start_index = slice_end; + } + } + + /// Finishes tokenization, converting accumulated data into + /// [`TokenizedFile`]. + fn into_tokenized_file(mut self) -> TokenizedFile<'src> { + // Commit any trailing tokens + self.commit_current_line(); + // If we still have a `spanned_from` (i.e. a pure multi-line token with + // no local tokens on its last line), push a bare `Spanned` entry. 
+ if let Some(from) = self.multi_line_start.take() { + self.lines.push(Line::Spanned(from)); + } + + // Optimize for size + self.buffer.shrink_to_fit(); + self.lines.shrink_to_fit(); + + TokenizedFile { + buffer: self.buffer, + lines: self.lines, + had_errors: self.had_errors, + } + } +} + +fn build_span<'src>(token: Token, text: &'src str) -> TokenSpan<'src> { + let length_utf16 = text.encode_utf16().count(); + TokenSpan { + lexeme: text, + token, + length_utf16, + } +} + +fn token_is_newline(token: &Token) -> bool { + matches!(token, Token::NewLine) +} + +fn token_can_span_lines(token: &Token) -> bool { + matches!( + token, + Token::BlockComment | Token::Brace(lexing::BraceKind::CppBlock) | Token::Error + ) +} + +/// Counts the number of new lines in given text. +fn count_newlines(text: &str) -> usize { + let mut bytes_iterator = text.as_bytes().iter().peekable(); + let mut newline_count = 0; + while let Some(&next_byte) = bytes_iterator.next() { + // Logos' regex rule is "\r\n|\n|\r", so we agree with it on new line + // character treatment + match next_byte { + b'\r' => { + newline_count += 1; + if let Some(&&b'\n') = bytes_iterator.peek() { + // skip the '\n' in a CRLF + bytes_iterator.next(); + } + } + b'\n' => { + newline_count += 1; + } + _ => (), + } + } + newline_count +} diff --git a/rottlib/src/lib.rs b/rottlib/src/lib.rs new file mode 100644 index 0000000..2f4509e --- /dev/null +++ b/rottlib/src/lib.rs @@ -0,0 +1,3 @@ +#![allow(clippy::doc_overindented_list_items)] + +pub mod lexer; diff --git a/rottlsp/Cargo.toml b/rottlsp/Cargo.toml new file mode 100644 index 0000000..5d731d0 --- /dev/null +++ b/rottlsp/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "rottlsp" +version = "0.1.0" +edition = "2024" + +[dependencies] +rottlib = { version = "0", path = "../rottlib" } +tokio = { version = "1", features = ["full"] } +tower-lsp = "0.20" + +[lints] +workspace = true \ No newline at end of file diff --git a/rottlsp/src/main.rs b/rottlsp/src/main.rs new file 
mode 100644 index 0000000..d88dfc1 --- /dev/null +++ b/rottlsp/src/main.rs @@ -0,0 +1,84 @@ +use tower_lsp::lsp_types; + +/// A Language Server implementation for Rott. +/// +/// Implements the [`tower_lsp::LanguageServer`] trait to handle LSP requests +/// (e.g. initialization, text synchronization, open notifications) +/// asynchronously. +struct RottLanguageServer { + /// Client handle for sending notifications and requests to the editor. + client: tower_lsp::Client, +} + +#[tower_lsp::async_trait] +impl tower_lsp::LanguageServer for RottLanguageServer { + // Inform the client of our server capabilities during initialization. + async fn initialize( + &self, + _: lsp_types::InitializeParams, + ) -> tower_lsp::jsonrpc::Result { + Ok(lsp_types::InitializeResult { + capabilities: lsp_types::ServerCapabilities { + // We can synchronize the text of files, which means we request + // to receive full updates whenever a file is opened or changed. + // `lsp_types::TextDocumentSyncKind::FULL` means we require full text + // every time. + text_document_sync: Some(lsp_types::TextDocumentSyncCapability::Kind( + lsp_types::TextDocumentSyncKind::FULL, + )), + ..Default::default() + }, + ..Default::default() + }) + } + + // On file open, tokenize the new document and log any lexing errors. + async fn did_open(&self, params: lsp_types::DidOpenTextDocumentParams) { + // Measure lexing performance to track parser responsiveness. 
+ let start_time = std::time::Instant::now(); + let has_errors = + rottlib::lexer::TokenizedFile::from_source(¶ms.text_document.text).had_errors(); + let elapsed_time = start_time.elapsed(); + + self.client + .log_message( + lsp_types::MessageType::INFO, + format!( + "Tokenized {} in {:?}", + params.text_document.uri.path(), + elapsed_time + ), + ) + .await; + + if has_errors { + self.client + .log_message( + lsp_types::MessageType::INFO, + format!( + "There was an error while tokenizing {}", + params.text_document.uri.path(), + ), + ) + .await; + } + } + + // Handle shutdown signal. + async fn shutdown(&self) -> tower_lsp::jsonrpc::Result<()> { + // No cleanup required on shutdown; simply acknowledge the request. + Ok(()) + } +} + +#[tokio::main] +async fn main() { + // We are using standard input and output for communicating with an editor, + // so we need to avoid methods or macros that write or read using them, + // e.g. `println!`. + let (stdin, stdout) = (tokio::io::stdin(), tokio::io::stdout()); + let (service, socket) = tower_lsp::LspService::new(|client| RottLanguageServer { client }); + tower_lsp::Server::new(stdin, stdout, socket) + .serve(service) + .await; +}