// zte/src/highlight/mod.rs — syntax highlighting for the editor.
use std::{ops::Range, path::Path};
/// The semantic category assigned to a matched span of source text.
///
/// `Eq` and `Hash` are derived so kinds can be compared fully and used as
/// map/set keys (e.g. for theme lookup tables).
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub enum TokenKind {
    /// Non-structural whitespace
    Whitespace,
    /// Identifiers and names
    Ident,
    /// Syntax keywords
    Keyword,
    /// Numeric literals
    Number,
    /// Types or type definitions
    Type,
    /// Comments, which have no effect on the code
    Comment,
    /// Documentation or doc comments
    Doc,
    /// Operators that perform work on operands
    Operator,
    /// Structural tokens (parentheses, braces, brackets, etc.)
    Delimiter,
    /// A field or method of another value (i.e: a named thing not present in the current namespace)
    Property,
    /// A special attribute or decorator attached to some other code
    Attribute,
    /// A macro, that transforms the code in some manner
    Macro,
    /// A string literal
    String,
    /// Misc special syntax (defined per-language)
    Special,
    /// A program constant or other statically-known name
    Constant,
}
/// A rule-driven syntax highlighter.
///
/// `matchers` and `entries` are parallel vectors: the pattern at index `i`
/// produces tokens of kind `entries[i]`. Patterns are tried in order and the
/// first one that matches at a position wins.
pub struct Highlighter {
    /// Compiled patterns, tried in registration order.
    matchers: Vec<Regex>,
    /// Token kind emitted when the matcher at the same index succeeds.
    entries: Vec<TokenKind>,
}
impl Highlighter {
    /// Build a highlighter from `(kind, pattern)` pairs.
    ///
    /// Patterns are written in this module's custom regex dialect (see
    /// [`Regex::parser`]) and are tried in the order given: when several
    /// patterns match at the same position, the first one listed wins.
    ///
    /// # Panics
    ///
    /// Panics if any pattern fails to parse.
    pub fn new_from_regex<P: AsRef<str>>(
        patterns: impl IntoIterator<Item = (TokenKind, P)>,
    ) -> Self {
        let (entries, patterns): (_, Vec<_>) = patterns.into_iter().unzip();
        let matchers = patterns
            .iter()
            .map(|p| Regex::parser().parse(p.as_ref()).unwrap())
            .collect();
        Self { entries, matchers }
    }
    /// Append one more `(kind, pattern)` rule, tried after all existing rules.
    ///
    /// # Panics
    ///
    /// Panics if the pattern fails to parse.
    pub fn with(mut self, token: TokenKind, p: impl AsRef<str>) -> Self {
        self.entries.push(token);
        self.matchers
            .push(Regex::parser().parse(p.as_ref()).unwrap());
        self
    }
    /// Pick a language highlighter based on a file's extension, if known.
    pub fn from_file_name(file_name: &Path) -> Option<Self> {
        match file_name.extension()?.to_str()? {
            "rs" => Some(Self::rust()),
            "md" => Some(Self::markdown()),
            "toml" => Some(Self::toml()),
            "c" | "h" | "cpp" | "hpp" | "cxx" | "js" | "ts" | "go" => Some(Self::generic_clike()),
            "glsl" | "vert" | "frag" => Some(Self::glsl()),
            _ => None,
        }
    }
    /// Highlighter for Markdown documents.
    pub fn markdown() -> Self {
        Self::new_from_regex([
            // Links
            (TokenKind::String, r"\[[^\]]*\](\([^\)]*\))?"),
            // Header
            (TokenKind::Doc, r"^#+[[:space:]][^$]*$"),
            // List item
            (TokenKind::Operator, r"^[[:space:]]?[\-([0-9]+[\)\.])]"),
            // Bold
            (TokenKind::Property, r"\*\*[^(\*\*)]*\*\*"),
            // Italics
            (TokenKind::Attribute, r"\*[^\*]*\*"),
            // Code block
            (TokenKind::Operator, r"^```[^(^```)]*^```"),
            // Inline code
            (TokenKind::Constant, r"`[^`$]*[`$]"),
            // HTML
            (TokenKind::Special, r"<[^<>]*>"),
        ])
    }
    /// Highlighter for Rust source code.
    pub fn rust() -> Self {
        Self::new_from_regex([
            // Both kinds of comments match multiple lines
            (
                TokenKind::Doc,
                r"\/\/[\/!][^\n]*$(\n[[:space:]]\/\/[\/!][^\n]*$)*",
            ),
            (TokenKind::Comment, r"\/\/[^$]*$(\n[[:space:]]\/\/[^$]*$)*"),
            // Multi-line comment
            (TokenKind::Comment, r"\/\*[^(\*\/)]*\*\/"),
            (
                TokenKind::Keyword,
                r"\b[(pub)(enum)(let)(self)(Self)(fn)(impl)(struct)(use)(if)(while)(for)(in)(loop)(mod)(match)(else)(break)(continue)(trait)(const)(static)(type)(mut)(as)(crate)(extern)(move)(ref)(return)(super)(unsafe)(use)(where)(async)(dyn)(try)(gen)(macro_rules)(union)(raw)]\b",
            ),
            (TokenKind::Constant, r"\b[(true)(false)]\b"),
            // Flow-control operators count as keywords
            (TokenKind::Keyword, r"\.await\b"),
            // Macro invocations: println!
            (TokenKind::Macro, r"\b[A-Za-z_][A-Za-z0-9_]*!"),
            // Meta-variables
            (TokenKind::Macro, r"\$[A-Za-z_][A-Za-z0-9_]*\b"),
            (TokenKind::Constant, r"\b[A-Z][A-Z0-9_]+\b"),
            (TokenKind::Type, r"\b[A-Z][A-Za-z0-9_]*\b"),
            // Primitives
            (
                TokenKind::Type,
                r"\b[(u8)(u16)(u32)(u64)(u128)(i8)(i16)(i32)(i64)(i128)(usize)(isize)(bool)(str)(char)(f16)(f32)(f64)(f128)]\b",
            ),
            // "foo" or b"foo" or r#"foo"#
            (TokenKind::String, r#"b?r?(#*)@("[(\\")[^("~)]]*("~))"#),
            // Characters
            (
                TokenKind::String,
                r#"b?'[(\\[nrt\\0(x[0-7A-Za-z][0-7A-Za-z])])[^']]'"#,
            ),
            (
                TokenKind::Operator,
                r"[(&(mut)?)(\?)(\+=?)(\-=?)(\*=?)(\/=?)(%=?)(!=?)(==?)(&&?=?)(\|\|?=?)(<<?=?)(>>?=?)(\.\.[\.=]?)\\\~\^:;,\@(=>?)]",
            ),
            // Fields and methods: a.foo
            (TokenKind::Property, r"\.[a-z_][A-Za-z0-9_]*"),
            // Paths: std::foo::bar
            (TokenKind::Property, r"[A-Za-z_][A-Za-z0-9_]*::"),
            // Lifetimes
            (TokenKind::Special, r"'[a-z_][A-Za-z0-9_]*\b"),
            (TokenKind::Ident, r"\b[a-z_][A-Za-z0-9_]*\b"),
            (TokenKind::Number, r"[0-9][A-Za-z0-9_\.]*"),
            (TokenKind::Delimiter, r"[\{\}\(\)\[\]]"),
            // NOTE: a duplicate `TokenKind::Macro` rule with the identical
            // bracket pattern used to follow here; it was unreachable (the
            // Delimiter rule above always matched first) and was removed.
            (TokenKind::Attribute, r"#!?\[[^\]]*\]"),
        ])
    }
    /// Highlighter for C-like languages, parameterized over the keyword,
    /// primitive-type, and builtin patterns (each written in the custom
    /// regex dialect).
    pub fn clike(keyword: &str, r#type: &str, builtin: &str) -> Self {
        Self::new_from_regex([
            // Both kinds of comments match multiple lines
            (
                TokenKind::Doc,
                r"\/\/[\/!][^\n]*$(\n[[:space:]]\/\/[\/!][^\n]*$)*",
            ),
            (TokenKind::Comment, r"\/\/[^$]*$(\n[[:space:]]\/\/[^$]*$)*"),
            // Multi-line comment
            (TokenKind::Comment, r"\/\*[^(\*\/)]*\*\/"),
            (TokenKind::Keyword, keyword),
            (TokenKind::Macro, builtin),
            (TokenKind::Constant, r"\b[(true)(false)]\b"),
            // Flow-control operators count as keywords
            (TokenKind::Keyword, r"\b[(\.await)\?]\b"),
            (TokenKind::Constant, r"\b[A-Z][A-Z0-9_]+\b"),
            (TokenKind::Type, r"\b[A-Z][A-Za-z0-9_]*\b"),
            // Primitives
            (TokenKind::Type, r#type),
            // "foo" or b"foo" or r#"foo"#
            (TokenKind::String, r#"b?r?(#*)@("[(\\")[^("~)]]*("~))"#),
            // Character strings
            (
                TokenKind::String,
                r#"b?'[(\\[nrt\\0(x[0-7A-Za-z][0-7A-Za-z])])[^']]*'"#,
            ),
            (
                TokenKind::Operator,
                r"[(&)(\?)(\+\+)(\-\-)(\+=?)(\-=?)(\*=?)(\/=?)(%=?)(!=?)(==?)(&&?=?)(\|\|?=?)(<<?=?)(>>?=?)(\.\.[\.=]?)\\\~\^:;,\@(=>?)]",
            ),
            // Fields and methods: a.foo
            (TokenKind::Property, r"\.[a-z_][A-Za-z0-9_]*"),
            // Paths: std::foo::bar
            (TokenKind::Property, r"[A-Za-z_][A-Za-z0-9_]*::"),
            (TokenKind::Ident, r"\b[a-z_][A-Za-z0-9_]*\b"),
            (TokenKind::Number, r"[0-9][A-Za-z0-9_\.]*"),
            (TokenKind::Delimiter, r"[\{\}\(\)\[\]]"),
            // Preprocessor
            (TokenKind::Macro, r"^#[^$]*$"),
        ])
    }
    /// Generic C-family highlighter shared by C/C++/JS/TS/Go files.
    pub fn generic_clike() -> Self {
        Self::clike(
            // keyword
            r"\b[(var)(enum)(let)(this)(fn)(struct)(class)(import)(if)(while)(for)(in)(loop)(else)(break)(continue)(const)(static)(type)(extern)(return)(async)(throw)(catch)(union)(auto)(namespace)(public)(private)(function)(func)]\b",
            // types
            r"\b[(([(unsigned)(signed)][[:space:]])*u?int[0-9]*(_t)?)(float)(double)(bool)(char)(size_t)(void)]\b",
            // no builtins: `[]` is an empty alternation, which never matches
            "[]",
        )
    }
    /// Highlighter for GLSL shaders.
    pub fn glsl() -> Self {
        Self::clike(
            // keyword
            r"\b[(struct)(if)(while)(for)(else)(break)(continue)(const)(return)(layout)(uniform)(set)(binding)(location)(in)]\b",
            // types
            r"\b[(u?int)(float)(double)(bool)(void)([ui]?vec[1-4]*)([ui]?mat[1-4]*)(texture[(2D)(3D)]?(Cube)?)([ui]?sampler[(2D)(3D)]?(Shadow)?)]\b",
            // Builtins
            r"\b[(dot)(cross)(textureSize)(normalize)(texelFetch)(textureProj)(max)(min)(clamp)(reflect)(mix)(distance)(length)(abs)(pow)(sign)(sin)(cos)(tan)(fract)(mod)(round)(step)]\b",
        )
    }
    /// Highlighter for TOML configuration files.
    pub fn toml() -> Self {
        Self::new_from_regex([
            // Header
            (TokenKind::Doc, r#"^\[[^\n\]]*\]$"#),
            // Delimiters
            (TokenKind::Delimiter, r"[\{\}\(\)\[\]]"),
            // Operators
            (TokenKind::Operator, r"[=,]"),
            // Numbers
            (TokenKind::Number, r"[0-9][A-Za-z0-9_\.]*"),
            // Double-quoted strings
            (
                TokenKind::String,
                r#"b?"[(\\[nrt\\0(x[0-7A-Za-z][0-7A-Za-z])])[^"]]*""#,
            ),
            // Single-quoted strings
            (
                TokenKind::String,
                r#"b?'[(\\[nrt\\0(x[0-7A-Za-z][0-7A-Za-z])])[^']]*'"#,
            ),
            // Booleans
            (TokenKind::Constant, r"\b[(true)(false)]\b"),
            // Identifier
            (TokenKind::Ident, r"\b[a-z_][A-Za-z0-9_\-]*\b"),
            // Comments
            (TokenKind::Comment, r"#[^$]*$"),
        ])
    }
    /// Tokenize `s`: at each position, try every matcher in registration
    /// order and emit a token for the first one that matches. Characters
    /// matched by no pattern are skipped without producing a token, so the
    /// output ranges may have gaps.
    ///
    /// NOTE(review): a pattern that succeeded while consuming zero characters
    /// would make `n == 0` and loop forever here; all current patterns
    /// consume at least one character on success.
    fn highlight_str(&self, mut s: &[char]) -> Vec<Token> {
        let mut tokens = Vec::new();
        let mut i = 0;
        loop {
            let n = if let Some((idx, n)) = self
                .matchers
                .iter()
                .enumerate()
                .find_map(|(i, r)| Some((i, r.matches(s)?)))
            {
                tokens.push(Token {
                    kind: self.entries[idx],
                    range: i..i + n,
                });
                n
            } else if !s.is_empty() {
                // No pattern matched here: skip a single character.
                1
            } else {
                break;
            };
            i += n;
            s = &s[n..];
        }
        tokens
    }
    /// Consume the highlighter and produce [`Highlights`] for `s`.
    /// Positions in the result are char indices into `s`.
    pub fn highlight(self, s: &[char]) -> Highlights {
        let tokens = self.highlight_str(s);
        Highlights {
            highlighter: self,
            tokens,
        }
    }
}
/// The result of running a [`Highlighter`] over a buffer: the highlighter
/// itself (returned so it can be reused) plus the tokens it produced.
pub struct Highlights {
    pub highlighter: Highlighter,
    /// Tokens in ascending order of `range.start`; characters that matched
    /// no pattern have no token, so ranges may have gaps.
    tokens: Vec<Token>,
}
/// A single highlighted span: a token kind plus the half-open, char-indexed
/// range it covers in the input.
///
/// `Debug` and `PartialEq` are derived so tokens can be asserted on and
/// printed in tests and diagnostics.
#[derive(Clone, Debug, PartialEq)]
pub struct Token {
    pub kind: TokenKind,
    pub range: Range<usize>,
}
impl Highlights {
    /// Update the highlights after `s` is inserted at char offset `at`.
    ///
    /// NOTE(review): currently a no-op stub — tokens are neither shifted nor
    /// re-computed, so highlights go stale after an edit until the buffer is
    /// re-highlighted.
    pub fn insert(&mut self, at: usize, s: &str) {}
    /// Return the token covering char position `pos`, if any.
    pub fn get_at(&self, pos: usize) -> Option<&Token> {
        // Tokens are sorted by start offset, so binary-search on it. On a
        // miss, `binary_search_by_key` yields the insertion point `p`; the
        // only token that could contain `pos` is the one just before it,
        // hence `p - 1` (saturating so `p == 0` stays in bounds).
        let idx = self.tokens
            .binary_search_by_key(&pos, |tok| tok.range.start)
            .unwrap_or_else(|p| p.saturating_sub(1))
            ;
        let tok = self.tokens.get(idx)?;
        // The candidate may still end before `pos` (token ranges can have
        // gaps between them), so confirm containment before returning it.
        if tok.range.contains(&pos) {
            Some(tok)
        } else {
            None
        }
    }
}
/// AST of the module's custom regex dialect (built by [`Regex::parser`],
/// executed by `State::go`).
#[derive(Clone, Debug)]
pub enum Regex {
    /// `[[:space:]]`: one or more ASCII whitespace characters.
    Whitespace,
    /// `\b`: a word/non-word boundary (zero-width).
    WordBoundary,
    /// `^`: start of input or just after a newline (zero-width).
    LineStart,
    /// `$`: end of input or just before a newline (zero-width).
    LineEnd,
    /// `~`: the delimiter text captured by the innermost enclosing `Delim`.
    LastDelim,
    /// `a-b`: one character in the inclusive range.
    Range(char, char),
    /// A literal character.
    Char(char),
    /// `[...]`: alternation — the first alternative that matches wins.
    /// Unlike standard regex, alternatives may be arbitrary sub-patterns.
    Set(Vec<Self>),
    /// `[^...]`: consume one character, provided no alternative matches here.
    NegSet(Vec<Self>),
    /// `(...)`: a sequence that must match in order.
    Group(Vec<Self>),
    /// `*` / `+` / `?`: repetition as (at_least, at_most, pattern).
    Many(usize, usize, Box<Self>),
    /// `d@x`: match `d`, remember the exact text it consumed, then match `x`,
    /// inside which `~` matches that remembered text (used for raw strings).
    Delim(Box<Self>, Box<Self>),
}
/// Matcher state threaded through `State::go`.
struct State<'a> {
    /// The input being matched, as chars.
    s: &'a [char],
    /// Current offset into `s`.
    pos: usize,
    /// Set while matching the right-hand side of a `Regex::Delim`; consumed
    /// by `Regex::LastDelim` (`~`).
    delim: Option<&'a [char]>,
}
impl State<'_> {
    /// The character at the current position, if any.
    fn peek(&self) -> Option<char> {
        self.s.get(self.pos).copied()
    }
    /// The character just before the current position, if any.
    fn prev(&self) -> Option<char> {
        self.s[..self.pos].last().copied()
    }
    /// Advance one char if it satisfies `f`; otherwise `None` without moving.
    fn skip_if(&mut self, f: impl FnOnce(char) -> bool) -> Option<()> {
        self.peek().filter(|c| f(*c))?;
        self.pos += 1;
        Some(())
    }
    /// Try to match `r`, restoring the position on failure (backtracking
    /// wrapper around [`Self::go`]).
    fn attempt(&mut self, r: &Regex) -> Option<()> {
        let old_pos = self.pos;
        if self.go(r).is_some() {
            Some(())
        } else {
            self.pos = old_pos;
            None
        }
    }
    /// Match `r` at the current position, advancing past what it consumes.
    ///
    /// On failure the position may be left partway into `r`; callers needing
    /// backtracking must use [`Self::attempt`] instead.
    fn go(&mut self, r: &Regex) -> Option<()> {
        match r {
            // Zero-width: succeeds when exactly one side of the position is
            // a word char. Out-of-bounds neighbours are treated as spaces.
            Regex::WordBoundary => {
                let is_word = |c: char| c.is_alphanumeric() || c == '_';
                (is_word(self.prev().unwrap_or(' ')) != is_word(self.peek().unwrap_or(' ')))
                    .then_some(())
            }
            // Zero-width: start of input or just after '\n'.
            Regex::LineStart => self.prev().map_or(true, |c| c == '\n').then_some(()),
            // Zero-width: end of input or just before '\n'.
            Regex::LineEnd => self.peek().map_or(true, |c| c == '\n').then_some(()),
            // Consume the delimiter captured by the enclosing `Delim`;
            // fails if no delimiter is in scope (`self.delim?`).
            Regex::LastDelim => {
                if self.s[self.pos..].starts_with(self.delim?) {
                    self.pos += self.delim.unwrap().len();
                    Some(())
                } else {
                    None
                }
            }
            Regex::Char(x) => self.skip_if(|c| c == *x),
            // One or more ASCII whitespace characters (greedy).
            Regex::Whitespace => {
                let mut once = false;
                while self.skip_if(|c| c.is_ascii_whitespace()).is_some() {
                    once = true;
                }
                once.then_some(())
            }
            // `[^...]`: if no alternative matches here, consume one char.
            Regex::NegSet(xs) => {
                if xs.iter().all(|x| self.attempt(x).is_none()) {
                    self.skip_if(|_| true)?;
                    Some(())
                } else {
                    None
                }
            }
            // `[...]`: first alternative that matches, backtracking between
            // alternatives.
            Regex::Set(xs) => xs.iter().find_map(|x| self.attempt(x)),
            // `(...)`: every element must match, in sequence.
            Regex::Group(xs) => {
                for x in xs {
                    self.go(x)?;
                }
                Some(())
            }
            Regex::Range(a, b) => self.skip_if(|c| (a..=b).contains(&&c)),
            // Greedy repetition, between `at_least` and `at_most` times.
            Regex::Many(at_least, at_most, x) => {
                let mut times = 0;
                loop {
                    let pos = self.pos;
                    if times >= *at_most || self.attempt(x).is_none() {
                        break (times >= *at_least).then_some(());
                    }
                    // Guard: a zero-width inner match under `*`/`+` would
                    // otherwise repeat forever.
                    assert_ne!(pos, self.pos, "{x:?}");
                    times += 1;
                }
            }
            // `d@r`: match `d`, bind `~` to the text it consumed, match `r`,
            // then restore the previous binding (so `@` nests).
            Regex::Delim(d, r) => {
                let old_pos = self.pos;
                self.go(d)?;
                let old_delim = self.delim.replace(&self.s[old_pos..self.pos]);
                let res = self.go(r);
                self.delim = old_delim;
                res
            }
        }
    }
}
impl Regex {
    /// Run `self` against the start of `s`, returning the number of chars the
    /// pattern consumed on success.
    fn matches(&self, s: &[char]) -> Option<usize> {
        let mut state = State {
            s,
            pos: 0,
            delim: None,
        };
        state.go(self)?;
        Some(state.pos)
    }
}
use chumsky::{
pratt::{infix, left, postfix},
prelude::*,
};
impl Regex {
    /// Parser for the module's custom regex dialect, built with chumsky.
    ///
    /// Differences from standard regex syntax, as implemented here:
    /// - `[...]` is an alternation of arbitrary sub-patterns (not a char
    ///   class); `[^...]` consumes one char provided no alternative matches.
    /// - `[[:space:]]` matches one or more ASCII whitespace characters.
    /// - `x@y` matches `x` then `y`, with `~` inside `y` matching the exact
    ///   text `x` consumed (used for Rust raw strings).
    fn parser<'a>() -> impl Parser<'a, &'a str, Self, extra::Err<Rich<'a, char>>> {
        recursive(|regex| {
            // Characters that must be backslash-escaped to appear literally.
            let metachars = r"{}[]()^$.|*+-?\/@~";
            let char_ = choice((
                none_of(metachars),
                // Escaped meta characters
                just('\\').ignore_then(one_of(metachars)),
                just("\\n").to('\n'),
            ));
            // `a-b` inclusive character range.
            let range = char_
                .then_ignore(just('-'))
                .then(char_)
                .map(|(a, b)| Self::Range(a, b));
            // Zero or more sub-patterns, used as the interior of [], [^], ().
            let items = regex.clone().repeated().collect();
            let atom = choice((
                range,
                char_.map(Self::Char),
                just("\\b").to(Self::WordBoundary),
                just("^").to(Self::LineStart),
                just("$").to(Self::LineEnd),
                just("~").to(Self::LastDelim),
                // Classes
                just("[[:space:]]").map(|_| Self::Whitespace),
                items
                    .clone()
                    .delimited_by(just("[^"), just(']'))
                    .map(Regex::NegSet),
                items
                    .clone()
                    .delimited_by(just('['), just(']'))
                    .map(Regex::Set),
                items
                    .clone()
                    .delimited_by(just('('), just(')'))
                    .map(Regex::Group),
            ));
            // Postfix repetition operators bind tighter than `@`.
            atom.pratt((
                postfix(1, just('*'), |r, _, _| Self::Many(0, !0, Box::new(r))),
                postfix(1, just('+'), |r, _, _| Self::Many(1, !0, Box::new(r))),
                postfix(1, just('?'), |r, _, _| Self::Many(0, 1, Box::new(r))),
                // Non-standard: `x@y` parses `x` and then `y`. `y` can use `~` to refer to the extra string that was
                // parsed by `x`. This supports nesting and is intended for context-sensitive patterns like Rust raw
                // strings.
                infix(left(0), just('@'), |d, _, r, _| {
                    Self::Delim(Box::new(d), Box::new(r))
                }),
            ))
        })
        // A whole pattern is a sequence of such terms.
        .repeated()
        .collect()
        .map(Self::Group)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    /// A lone keyword should highlight as exactly one Keyword token.
    ///
    /// (The previous version of this test passed a `&str` to `highlight`,
    /// which takes `&[char]`, compared whole `Token`s, and expected an empty
    /// token list for a keyword — it neither compiled nor asserted the right
    /// thing.)
    #[test]
    fn simple() {
        let src: Vec<char> = "pub".chars().collect();
        let hl = Highlighter::rust().highlight(&src);
        assert_eq!(hl.tokens.len(), 1);
        assert_eq!(hl.tokens[0].kind, TokenKind::Keyword);
        assert_eq!(hl.tokens[0].range, 0..3);
    }
}