//! Regex-driven syntax highlighting: token kinds, per-language highlighters,
//! and a small custom regex engine with a chumsky-based pattern parser.
use std::{ops::Range, path::Path};
|
|
|
|
/// The category assigned to a span of highlighted text.
///
/// `Eq` and `Hash` are derived (in addition to `PartialEq`) so kinds can be
/// used as map/set keys, e.g. for per-kind styling tables.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub enum TokenKind {
    /// Non-structural whitespace
    Whitespace,
    /// Identifiers and names
    Ident,
    /// Syntax keywords
    Keyword,
    /// Numeric literals
    Number,
    /// Types or type definitions
    Type,
    /// Comments, which have no effect on the code
    Comment,
    /// Documentation or doc comments
    Doc,
    /// Operators that perform work on operands
    Operator,
    /// Structural tokens (parentheses, braces, brackets, etc.)
    Delimiter,
    /// A field or method of another value (i.e: a named thing not present in the current namespace)
    Property,
    /// A special attribute or decorator attached to some other code
    Attribute,
    /// A macro, that transforms the code in some manner
    Macro,
    /// A string literal
    String,
    /// Misc special syntax (defined per-language)
    Special,
    /// A program constant or other statically-known name
    Constant,
}
|
|
|
|
/// A regex-driven syntax highlighter.
///
/// Holds one compiled pattern per rule; `matchers[i]` produces tokens of
/// kind `entries[i]`. Patterns are tried in order at each input position, so
/// earlier rules take priority.
pub struct Highlighter {
    /// Compiled patterns, tried in order at each input position.
    matchers: Vec<Regex>,
    /// Token kind emitted when the matcher at the same index succeeds.
    entries: Vec<TokenKind>,
}
|
|
|
|
impl Highlighter {
|
|
pub fn new_from_regex<P: AsRef<str>>(
|
|
patterns: impl IntoIterator<Item = (TokenKind, P)>,
|
|
) -> Self {
|
|
let (entries, patterns): (_, Vec<_>) = patterns.into_iter().unzip();
|
|
|
|
let matchers = patterns
|
|
.iter()
|
|
.map(|p| Regex::parser().parse(p.as_ref()).unwrap())
|
|
.collect();
|
|
|
|
Self { entries, matchers }
|
|
}
|
|
|
|
pub fn with(mut self, token: TokenKind, p: impl AsRef<str>) -> Self {
|
|
self.entries.push(token);
|
|
self.matchers
|
|
.push(Regex::parser().parse(p.as_ref()).unwrap());
|
|
self
|
|
}
|
|
|
|
pub fn from_file_name(file_name: &Path) -> Option<Self> {
|
|
match file_name.extension()?.to_str()? {
|
|
"rs" => Some(Self::rust()),
|
|
"md" => Some(Self::markdown()),
|
|
"toml" => Some(Self::toml()),
|
|
"c" | "h" | "cpp" | "hpp" | "cxx" | "js" | "ts" | "go" => Some(Self::generic_clike()),
|
|
"glsl" | "vert" | "frag" => Some(Self::glsl()),
|
|
_ => None,
|
|
}
|
|
}
|
|
|
|
/// Create a highlighter for Markdown documents.
///
/// Rules are ordered by priority: links are recognised before bold, which
/// is recognised before italics, etc.
pub fn markdown() -> Self {
    Self::new_from_regex([
        // Links: `[text]` with an optional `(url)` suffix
        (TokenKind::String, r"\[[^\]]*\](\([^\)]*\))?"),
        // Header: `#`s at line start, then the rest of the line
        (TokenKind::Doc, r"^#+[[:space:]][^$]*$"),
        // List item: a `-` bullet or `N)` / `N.` numbering at line start
        (TokenKind::Operator, r"^[[:space:]]?[\-([0-9]+[\)\.])]"),
        // Bold: **text**
        (TokenKind::Property, r"\*\*[^(\*\*)]*\*\*"),
        // Italics: *text*
        (TokenKind::Attribute, r"\*[^\*]*\*"),
        // Fenced code block (``` ... ```)
        (TokenKind::Operator, r"^```[^(^```)]*^```"),
        // Inline code: `text`
        (TokenKind::Constant, r"`[^`$]*[`$]"),
        // Inline HTML tag
        (TokenKind::Special, r"<[^<>]*>"),
    ])
}
|
|
|
|
pub fn rust() -> Self {
|
|
Self::new_from_regex([
|
|
// Both kinds of comments match multiple lines
|
|
(
|
|
TokenKind::Doc,
|
|
r"\/\/[\/!][^\n]*$(\n[[:space:]]\/\/[\/!][^\n]*$)*",
|
|
),
|
|
(TokenKind::Comment, r"\/\/[^$]*$(\n[[:space:]]\/\/[^$]*$)*"),
|
|
// Multi-line comment
|
|
(TokenKind::Comment, r"\/\*[^(\*\/)]*\*\/"),
|
|
(
|
|
TokenKind::Keyword,
|
|
r"\b[(pub)(enum)(let)(self)(Self)(fn)(impl)(struct)(use)(if)(while)(for)(in)(loop)(mod)(match)(else)(break)(continue)(trait)(const)(static)(type)(mut)(as)(crate)(extern)(move)(ref)(return)(super)(unsafe)(use)(where)(async)(dyn)(try)(gen)(macro_rules)(union)(raw)]\b",
|
|
),
|
|
(TokenKind::Constant, r"\b[(true)(false)]\b"),
|
|
// Flow-control operators count as keywords
|
|
(TokenKind::Keyword, r"\.await\b"),
|
|
// Macro invocations: println!
|
|
(TokenKind::Macro, r"\b[A-Za-z_][A-Za-z0-9_]*!"),
|
|
// Meta-variables
|
|
(TokenKind::Macro, r"\$[A-Za-z_][A-Za-z0-9_]*\b"),
|
|
(TokenKind::Constant, r"\b[A-Z][A-Z0-9_]+\b"),
|
|
(TokenKind::Type, r"\b[A-Z][A-Za-z0-9_]*\b"),
|
|
// Primitives
|
|
(
|
|
TokenKind::Type,
|
|
r"\b[(u8)(u16)(u32)(u64)(u128)(i8)(i16)(i32)(i64)(i128)(usize)(isize)(bool)(str)(char)(f16)(f32)(f64)(f128)]\b",
|
|
),
|
|
// "foo" or b"foo" or r#"foo"#
|
|
(TokenKind::String, r#"b?r?(#*)@("[(\\")[^("~)]]*("~))"#),
|
|
// Characters
|
|
(
|
|
TokenKind::String,
|
|
r#"b?'[(\\[nrt\\0(x[0-7A-Za-z][0-7A-Za-z])])[^']]'"#,
|
|
),
|
|
(
|
|
TokenKind::Operator,
|
|
r"[(&(mut)?)(\?)(\+=?)(\-=?)(\*=?)(\/=?)(%=?)(!=?)(==?)(&&?=?)(\|\|?=?)(<<?=?)(>>?=?)(\.\.[\.=]?)\\\~\^:;,\@(=>?)]",
|
|
),
|
|
// Fields and methods: a.foo
|
|
(TokenKind::Property, r"\.[a-z_][A-Za-z0-9_]*"),
|
|
// Paths: std::foo::bar
|
|
(TokenKind::Property, r"[A-Za-z_][A-Za-z0-9_]*::"),
|
|
// Lifetimes
|
|
(TokenKind::Special, r"'[a-z_][A-Za-z0-9_]*\b"),
|
|
(TokenKind::Ident, r"\b[a-z_][A-Za-z0-9_]*\b"),
|
|
(TokenKind::Number, r"[0-9][A-Za-z0-9_\.]*"),
|
|
(TokenKind::Delimiter, r"[\{\}\(\)\[\]]"),
|
|
(TokenKind::Macro, r"[\{\}\(\)\[\]]"),
|
|
(TokenKind::Attribute, r"#!?\[[^\]]*\]"),
|
|
])
|
|
}
|
|
|
|
/// Create a highlighter for a generic C-like language.
///
/// `keyword`, `r#type`, and `builtin` are patterns (in this module's regex
/// dialect) for the language's keywords, primitive types, and builtin
/// functions respectively; the remaining rules are shared by all C-like
/// languages.
pub fn clike(keyword: &str, r#type: &str, builtin: &str) -> Self {
    Self::new_from_regex([
        // Both kinds of comments match multiple lines
        (
            TokenKind::Doc,
            r"\/\/[\/!][^\n]*$(\n[[:space:]]\/\/[\/!][^\n]*$)*",
        ),
        (TokenKind::Comment, r"\/\/[^$]*$(\n[[:space:]]\/\/[^$]*$)*"),
        // Multi-line comment
        (TokenKind::Comment, r"\/\*[^(\*\/)]*\*\/"),
        // Language-specific keyword list
        (TokenKind::Keyword, keyword),
        // Language-specific builtins, highlighted like macros
        (TokenKind::Macro, builtin),
        (TokenKind::Constant, r"\b[(true)(false)]\b"),
        // Flow-control operators count as keywords
        (TokenKind::Keyword, r"\b[(\.await)\?]\b"),
        (TokenKind::Constant, r"\b[A-Z][A-Z0-9_]+\b"),
        (TokenKind::Type, r"\b[A-Z][A-Za-z0-9_]*\b"),
        // Primitives (language-specific)
        (TokenKind::Type, r#type),
        // "foo" or b"foo" or r#"foo"#
        (TokenKind::String, r#"b?r?(#*)@("[(\\")[^("~)]]*("~))"#),
        // Character strings
        (
            TokenKind::String,
            r#"b?'[(\\[nrt\\0(x[0-7A-Za-z][0-7A-Za-z])])[^']]*'"#,
        ),
        (
            TokenKind::Operator,
            r"[(&)(\?)(\+\+)(\-\-)(\+=?)(\-=?)(\*=?)(\/=?)(%=?)(!=?)(==?)(&&?=?)(\|\|?=?)(<<?=?)(>>?=?)(\.\.[\.=]?)\\\~\^:;,\@(=>?)]",
        ),
        // Fields and methods: a.foo
        (TokenKind::Property, r"\.[a-z_][A-Za-z0-9_]*"),
        // Paths: std::foo::bar
        (TokenKind::Property, r"[A-Za-z_][A-Za-z0-9_]*::"),
        (TokenKind::Ident, r"\b[a-z_][A-Za-z0-9_]*\b"),
        (TokenKind::Number, r"[0-9][A-Za-z0-9_\.]*"),
        (TokenKind::Delimiter, r"[\{\}\(\)\[\]]"),
        // Preprocessor directives (whole line starting with `#`)
        (TokenKind::Macro, r"^#[^$]*$"),
    ])
}
|
|
|
|
/// A catch-all highlighter for C-family languages (C, C++, JS, TS, Go, …)
/// using one approximate, shared keyword/type list.
pub fn generic_clike() -> Self {
    Self::clike(
        // keyword
        r"\b[(var)(enum)(let)(this)(fn)(struct)(class)(import)(if)(while)(for)(in)(loop)(else)(break)(continue)(const)(static)(type)(extern)(return)(async)(throw)(catch)(union)(auto)(namespace)(public)(private)(function)(func)]\b",
        // types
        r"\b[(([(unsigned)(signed)][[:space:]])*u?int[0-9]*(_t)?)(float)(double)(bool)(char)(size_t)(void)]\b",
        // builtins: `[]` is an empty alternative set, which never matches,
        // i.e. no builtins are highlighted for the generic profile
        "[]",
    )
}
|
|
|
|
/// Create a highlighter for GLSL shaders (`.glsl`, `.vert`, `.frag`).
pub fn glsl() -> Self {
    Self::clike(
        // keyword (includes layout-qualifier words like `uniform`/`binding`)
        r"\b[(struct)(if)(while)(for)(else)(break)(continue)(const)(return)(layout)(uniform)(set)(binding)(location)(in)]\b",
        // types (scalars, vectors, matrices, textures, samplers)
        r"\b[(u?int)(float)(double)(bool)(void)([ui]?vec[1-4]*)([ui]?mat[1-4]*)(texture[(2D)(3D)]?(Cube)?)([ui]?sampler[(2D)(3D)]?(Shadow)?)]\b",
        // Builtins
        r"\b[(dot)(cross)(textureSize)(normalize)(texelFetch)(textureProj)(max)(min)(clamp)(reflect)(mix)(distance)(length)(abs)(pow)(sign)(sin)(cos)(tan)(fract)(mod)(round)(step)]\b",
    )
}
|
|
|
|
/// Create a highlighter for TOML files.
pub fn toml() -> Self {
    Self::new_from_regex([
        // Table header: a whole line of the form `[section]`
        (TokenKind::Doc, r#"^\[[^\n\]]*\]$"#),
        // Delimiters
        (TokenKind::Delimiter, r"[\{\}\(\)\[\]]"),
        // Operators
        (TokenKind::Operator, r"[=,]"),
        // Numbers
        (TokenKind::Number, r"[0-9][A-Za-z0-9_\.]*"),
        // Double-quoted strings (with escape sequences)
        (
            TokenKind::String,
            r#"b?"[(\\[nrt\\0(x[0-7A-Za-z][0-7A-Za-z])])[^"]]*""#,
        ),
        // Single-quoted strings
        (
            TokenKind::String,
            r#"b?'[(\\[nrt\\0(x[0-7A-Za-z][0-7A-Za-z])])[^']]*'"#,
        ),
        // Booleans
        (TokenKind::Constant, r"\b[(true)(false)]\b"),
        // Identifier (bare keys, which may contain `-`)
        (TokenKind::Ident, r"\b[a-z_][A-Za-z0-9_\-]*\b"),
        // Comments (from `#` to end of line)
        (TokenKind::Comment, r"#[^$]*$"),
    ])
}
|
|
|
|
fn highlight_str(&self, mut s: &[char]) -> Vec<Token> {
|
|
let mut tokens = Vec::new();
|
|
let mut i = 0;
|
|
loop {
|
|
let n = if let Some((idx, n)) = self
|
|
.matchers
|
|
.iter()
|
|
.enumerate()
|
|
.find_map(|(i, r)| Some((i, r.matches(s)?)))
|
|
{
|
|
tokens.push(Token {
|
|
kind: self.entries[idx],
|
|
range: i..i + n,
|
|
});
|
|
n
|
|
} else if !s.is_empty() {
|
|
1
|
|
} else {
|
|
break;
|
|
};
|
|
i += n;
|
|
s = &s[n..];
|
|
}
|
|
tokens
|
|
}
|
|
|
|
pub fn highlight(self, s: &[char]) -> Highlights {
|
|
let tokens = self.highlight_str(s);
|
|
Highlights {
|
|
highlighter: self,
|
|
tokens,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// The result of running a [`Highlighter`] over a piece of text.
pub struct Highlights {
    /// The highlighter that produced these tokens.
    pub highlighter: Highlighter,
    /// Tokens in source order; ranges are non-overlapping `char` indices,
    /// though gaps may exist where no rule matched.
    tokens: Vec<Token>,
}
|
|
|
|
#[derive(Clone)]
|
|
pub struct Token {
|
|
pub kind: TokenKind,
|
|
pub range: Range<usize>,
|
|
}
|
|
|
|
impl Highlights {
    /// Update the highlights after `s` is inserted at char index `at`.
    ///
    /// NOTE(review): currently a no-op stub — token ranges are not adjusted
    /// after edits; confirm whether callers re-run `highlight` instead.
    pub fn insert(&mut self, at: usize, s: &str) {}

    /// Find the token covering char index `pos`, if any.
    pub fn get_at(&self, pos: usize) -> Option<&Token> {
        // Tokens are sorted by start position, so binary-search on starts.
        // An exact hit yields that token's index; a miss yields the insertion
        // point, from which we step back to the last token starting at or
        // before `pos` (saturating at index 0).
        let idx = self
            .tokens
            .binary_search_by_key(&pos, |tok| tok.range.start)
            .unwrap_or_else(|p| p.saturating_sub(1));
        let tok = self.tokens.get(idx)?;
        // The candidate starts at or before `pos`, but may still end before
        // it — e.g. when `pos` falls in an untokenized gap.
        if tok.range.contains(&pos) {
            Some(tok)
        } else {
            None
        }
    }
}
|
|
|
|
/// A compiled pattern in the highlighter's small regex dialect.
///
/// Built by [`Regex::parser`] and evaluated against a `&[char]` by
/// [`Regex::matches`] / [`State::go`].
#[derive(Clone, Debug)]
pub enum Regex {
    /// `[[:space:]]`: one or more ASCII whitespace characters.
    Whitespace,
    /// `\b`: a word/non-word boundary; consumes nothing.
    WordBoundary,
    /// `^`: start of input or just after a newline; consumes nothing.
    LineStart,
    /// `$`: end of input or just before a newline; consumes nothing.
    LineEnd,
    /// `~`: the exact text matched by the opening half of the enclosing
    /// `x@y` pattern.
    LastDelim,
    /// A character range such as `a-z` (inclusive on both ends).
    Range(char, char),
    /// A single literal character.
    Char(char),
    /// `[…]`: alternatives tried in order; the first match wins.
    Set(Vec<Self>),
    /// `[^…]`: consumes one character, provided no alternative matches here.
    NegSet(Vec<Self>),
    /// `(…)`: a sequence of sub-patterns matched one after another.
    Group(Vec<Self>),
    // (at_least, at_most, _)
    Many(usize, usize, Box<Self>),
    // (delimiter, x) - delimit x with `delimiter` on either side (used for raw strings)
    Delim(Box<Self>, Box<Self>),
}
|
|
|
|
/// Cursor state used while matching a [`Regex`] against a character slice.
struct State<'a> {
    /// The full input being matched.
    s: &'a [char],
    /// Current cursor position into `s`.
    pos: usize,
    /// Text captured by the opening half of the innermost `x@y` pattern,
    /// consulted by `~` ([`Regex::LastDelim`]); `None` outside any `@`.
    delim: Option<&'a [char]>,
}
|
|
|
|
impl State<'_> {
    /// The character at the cursor, if any.
    fn peek(&self) -> Option<char> {
        self.s.get(self.pos).copied()
    }

    /// The character immediately before the cursor, if any.
    fn prev(&self) -> Option<char> {
        self.s[..self.pos].last().copied()
    }

    /// Consume one character if it satisfies `f`; returns `None` (without
    /// advancing) at end of input or when the predicate fails.
    fn skip_if(&mut self, f: impl FnOnce(char) -> bool) -> Option<()> {
        self.peek().filter(|c| f(*c))?;
        self.pos += 1;
        Some(())
    }

    /// Try to match `r`, restoring the cursor on failure (backtracking
    /// wrapper around [`Self::go`]).
    fn attempt(&mut self, r: &Regex) -> Option<()> {
        let old_pos = self.pos;
        if self.go(r).is_some() {
            Some(())
        } else {
            self.pos = old_pos;
            None
        }
    }

    /// Match `r` at the cursor, advancing past whatever it consumes.
    ///
    /// On failure the cursor may be left partway through the attempted
    /// match; callers needing backtracking must use [`Self::attempt`].
    fn go(&mut self, r: &Regex) -> Option<()> {
        match r {
            // `\b`: succeeds where a word character meets a non-word one.
            // Out-of-range positions count as non-word (treated as ' ').
            Regex::WordBoundary => {
                let is_word = |c: char| c.is_alphanumeric() || c == '_';
                (is_word(self.prev().unwrap_or(' ')) != is_word(self.peek().unwrap_or(' ')))
                    .then_some(())
            }
            // `^`: start of input or just after a newline; consumes nothing.
            Regex::LineStart => self.prev().map_or(true, |c| c == '\n').then_some(()),
            // `$`: end of input or just before a newline; consumes nothing.
            Regex::LineEnd => self.peek().map_or(true, |c| c == '\n').then_some(()),
            // `~`: the text captured by the innermost `x@y`'s opening half.
            // Fails when no delimiter is in scope (`self.delim?`).
            Regex::LastDelim => {
                if self.s[self.pos..].starts_with(self.delim?) {
                    self.pos += self.delim.unwrap().len();
                    Some(())
                } else {
                    None
                }
            }
            Regex::Char(x) => self.skip_if(|c| c == *x),
            // `[[:space:]]`: one or more ASCII whitespace characters.
            Regex::Whitespace => {
                let mut once = false;
                while self.skip_if(|c| c.is_ascii_whitespace()).is_some() {
                    once = true;
                }
                once.then_some(())
            }
            // `[^…]`: consumes exactly one character, provided none of the
            // alternatives match at the cursor.
            Regex::NegSet(xs) => {
                if xs.iter().all(|x| self.attempt(x).is_none()) {
                    self.skip_if(|_| true)?;
                    Some(())
                } else {
                    None
                }
            }
            // `[…]`: alternatives tried in order; first match wins.
            Regex::Set(xs) => xs.iter().find_map(|x| self.attempt(x)),
            // `(…)`: each element in sequence. No backtracking between
            // elements: a mid-sequence failure leaves the cursor moved.
            Regex::Group(xs) => {
                for x in xs {
                    self.go(x)?;
                }
                Some(())
            }
            Regex::Range(a, b) => self.skip_if(|c| (a..=b).contains(&&c)),
            // `*` / `+` / `?`: greedy repetition between `at_least` and
            // `at_most` times, with no backtracking once matched.
            Regex::Many(at_least, at_most, x) => {
                let mut times = 0;
                loop {
                    let pos = self.pos;
                    if times >= *at_most || self.attempt(x).is_none() {
                        break (times >= *at_least).then_some(());
                    }
                    // A repeated sub-pattern that consumed nothing would
                    // repeat forever; treat that as a bug in the pattern.
                    assert_ne!(pos, self.pos, "{x:?}");
                    times += 1;
                }
            }
            // `x@y`: match `x`, then match `y` with `~` bound to the text
            // `x` consumed. The outer delimiter is saved and restored so
            // `@` patterns nest correctly.
            Regex::Delim(d, r) => {
                let old_pos = self.pos;
                self.go(d)?;
                let old_delim = self.delim.replace(&self.s[old_pos..self.pos]);
                let res = self.go(r);
                self.delim = old_delim;
                res
            }
        }
    }
}
|
|
|
|
impl Regex {
|
|
fn matches(&self, s: &[char]) -> Option<usize> {
|
|
let mut s = State {
|
|
s,
|
|
pos: 0,
|
|
delim: None,
|
|
};
|
|
s.go(self).map(|_| s.pos)
|
|
}
|
|
}
|
|
|
|
use chumsky::{
|
|
pratt::{infix, left, postfix},
|
|
prelude::*,
|
|
};
|
|
|
|
impl Regex {
    /// Parser for this module's (non-standard) regex dialect.
    ///
    /// Notable differences from ordinary regex syntax:
    /// - `[…]` holds full alternatives (chars, ranges, or groups), not just
    ///   single characters.
    /// - `x@y` and `~` implement delimiter capture for context-sensitive
    ///   patterns such as Rust raw strings.
    fn parser<'a>() -> impl Parser<'a, &'a str, Self, extra::Err<Rich<'a, char>>> {
        recursive(|regex| {
            // Characters with special meaning; must be `\`-escaped to be
            // matched literally.
            let metachars = r"{}[]()^$.|*+-?\/@~";
            let char_ = choice((
                none_of(metachars),
                // Escaped meta characters
                just('\\').ignore_then(one_of(metachars)),
                just("\\n").to('\n'),
            ));

            // Character ranges like `a-z`.
            let range = char_
                .then_ignore(just('-'))
                .then(char_)
                .map(|(a, b)| Self::Range(a, b));

            // Zero or more sub-patterns, reused for `[…]`, `[^…]`, `(…)`.
            let items = regex.clone().repeated().collect();

            let atom = choice((
                range,
                char_.map(Self::Char),
                just("\\b").to(Self::WordBoundary),
                just("^").to(Self::LineStart),
                just("$").to(Self::LineEnd),
                just("~").to(Self::LastDelim),
                // Classes
                just("[[:space:]]").map(|_| Self::Whitespace),
                items
                    .clone()
                    .delimited_by(just("[^"), just(']'))
                    .map(Regex::NegSet),
                items
                    .clone()
                    .delimited_by(just('['), just(']'))
                    .map(Regex::Set),
                items
                    .clone()
                    .delimited_by(just('('), just(')'))
                    .map(Regex::Group),
            ));

            atom.pratt((
                // `!0` is usize::MAX, i.e. effectively unbounded repetition.
                postfix(1, just('*'), |r, _, _| Self::Many(0, !0, Box::new(r))),
                postfix(1, just('+'), |r, _, _| Self::Many(1, !0, Box::new(r))),
                postfix(1, just('?'), |r, _, _| Self::Many(0, 1, Box::new(r))),
                // Non-standard: `x@y` parses `x` and then `y`. `y` can use `~` to refer to the extra string that was
                // parsed by `x`. This supports nesting and is intended for context-sensitive patterns like Rust raw
                // strings.
                infix(left(0), just('@'), |d, _, r, _| {
                    Self::Delim(Box::new(d), Box::new(r))
                }),
            ))
        })
        // A top-level pattern is a sequence of regexes, wrapped in a group.
        .repeated()
        .collect()
        .map(Self::Group)
    }
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn simple() {
        // `highlight` consumes a `&[char]`, not a `&str`, so convert first.
        let src: Vec<char> = "pub".chars().collect();
        let hl = Highlighter::rust().highlight(&src);
        // `pub` is a Rust keyword: exactly one token covering the whole
        // input, tagged as a keyword.
        assert_eq!(hl.tokens.len(), 1);
        assert_eq!(hl.tokens[0].kind, TokenKind::Keyword);
        assert_eq!(hl.tokens[0].range, 0..3);
    }
}
|