Better Rust highlighter support, markdown support

This commit is contained in:
Joshua Barretto 2025-06-15 23:17:15 +01:00
parent a64884d894
commit 3e3755c0b5
6 changed files with 235 additions and 64 deletions

10
Cargo.lock generated
View file

@ -455,6 +455,15 @@ dependencies = [
"syn", "syn",
] ]
[[package]]
name = "unicode-display-width"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a43273b656140aa2bb8e65351fe87c255f0eca706b2538a9bd4a590a3490bf3"
dependencies = [
"unicode-segmentation",
]
[[package]] [[package]]
name = "unicode-ident" name = "unicode-ident"
version = "1.0.12" version = "1.0.12"
@ -648,4 +657,5 @@ dependencies = [
"crossterm", "crossterm",
"slotmap", "slotmap",
"thiserror", "thiserror",
"unicode-display-width",
] ]

View file

@ -9,6 +9,7 @@ slotmap = "1.0"
crossterm = "0.27" crossterm = "0.27"
thiserror = "1.0" thiserror = "1.0"
chumsky = { version = "0.10.1", features = ["pratt"] } chumsky = { version = "0.10.1", features = ["pratt"] }
unicode-display-width = "0.3.0"
[profile.dev] [profile.dev]
opt-level = 2 opt-level = 2

View file

@ -2,11 +2,36 @@ use std::{ops::Range, path::Path};
#[derive(Copy, Clone, Debug, PartialEq)] #[derive(Copy, Clone, Debug, PartialEq)]
pub enum TokenKind { pub enum TokenKind {
/// Non-structural whitespace
Whitespace, Whitespace,
/// Identifiers and names
Ident, Ident,
/// Syntax keywords
Keyword, Keyword,
/// Numeric literals
Number, Number,
/// Types or type definitions
Type, Type,
/// Comments, which have no effect on the code
Comment,
/// Documentation or doc comments
Doc,
/// Operators that perform work on operands
Operator,
/// Structural tokens (parentheses, braces, brackets, etc.)
Delimiter,
/// A field or method of another value (i.e: a named thing not present in the current namespace)
Property,
/// A special attribute or decorator attached to some other code
Attribute,
/// A macro, that transforms the code in some manner
Macro,
/// A string literal
String,
/// Misc special syntax (defined per-language)
Special,
/// A program constant or other statically-known name
Constant,
} }
pub struct Highlighter { pub struct Highlighter {
@ -26,32 +51,88 @@ impl Highlighter {
.map(|p| Regex::parser().parse(p.as_ref()).unwrap()) .map(|p| Regex::parser().parse(p.as_ref()).unwrap())
.collect(); .collect();
Self { Self { entries, matchers }
entries,
/*regex: meta::Regex::new_many(&patterns).unwrap(),*/ matchers,
}
} }
pub fn from_file_name(file_name: &Path) -> Option<Self> { pub fn from_file_name(file_name: &Path) -> Option<Self> {
match file_name.extension()?.to_str()? { match file_name.extension()?.to_str()? {
"rs" => Some(Self::rust()), "rs" => Some(Self::rust()),
"md" => Some(Self::markdown()),
_ => None, _ => None,
} }
} }
pub fn rust() -> Self { pub fn markdown() -> Self {
Self::new_from_regex([ Self::new_from_regex([
( // Links
TokenKind::Keyword, (TokenKind::String, r"\[[^\]]*\](\([^\)]*\))?"),
r"\b[(pub)(enum)(let)(self)(Self)(fn)(impl)(struct)(use)(if)(while)(for)(loop)(mod)]\b", // Header
), (TokenKind::Doc, r"^#+[[:space:]][^$]*$"),
(TokenKind::Ident, r"[a-z_][A-Za-z0-9_]*"), // List item
(TokenKind::Type, r"[A-Z_][A-Za-z0-9_]*"), (TokenKind::Operator, r"^[[:space:]]?[\-([0-9]+[\)\.])]"),
(TokenKind::Number, r"[0-9][A-Za-z0-9_]*"), // Bold
(TokenKind::Property, r"\*\*[^(\*\*)]*\*\*"),
// Italics
(TokenKind::Attribute, r"\*[^\*]*\*"),
// Code block
(TokenKind::Operator, r"^```[^(^```)]*^```"),
// Inline code
(TokenKind::Constant, r"`[^`$]*[`$]"),
// HTML
(TokenKind::Special, r"<[^<>]*>"),
]) ])
} }
fn highlight_str(&self, mut s: &str) -> Vec<(Range<usize>, TokenKind)> { pub fn rust() -> Self {
Self::new_from_regex([
(TokenKind::Doc, r"\/\/[\/!][^\n]*$"),
(TokenKind::Comment, r"\/\/[^$]*$"),
// Multi-line comment
(TokenKind::Comment, r"\/\*[^(\*\/)]*\*\/"),
(
TokenKind::Keyword,
r"\b[(pub)(enum)(let)(self)(Self)(fn)(impl)(struct)(use)(if)(while)(for)(in)(loop)(mod)(match)(else)(break)(continue)(trait)(const)(static)(type)(mut)(as)(crate)(extern)(move)(ref)(return)(super)(unsafe)(use)(where)(async)(dyn)(try)(gen)(macro_rules)(union)(raw)]\b",
),
(TokenKind::Constant, r"\b[(true)(false)]\b"),
// Flow-control operators count as keywords
(TokenKind::Keyword, r"\b[(\.await)\?]\b"),
// Macro invocations: println!
(TokenKind::Macro, r"\b[A-Za-z_][A-Za-z0-9_]*!"),
// Meta-variables
(TokenKind::Macro, r"\$[A-Za-z_][A-Za-z0-9_]*\b"),
(TokenKind::Constant, r"\b[A-Z][A-Z0-9_]+\b"),
(TokenKind::Type, r"\b[A-Z][A-Za-z0-9_]*\b"),
// Primitives
(
TokenKind::Type,
r"\b[(u8)(u16)(u32)(u64)(u128)(i8)(i16)(i32)(i64)(i128)(usize)(isize)(bool)(str)(char)]\b",
),
// "foo" or b"foo" or r#"foo"#
(TokenKind::String, r#"b?r?(#*)@("[(\\")[^("~)]]*("~))"#),
// Characters
(
TokenKind::String,
r#"b?'[(\\[nrt\\0(x[0-7A-Za-z][0-7A-Za-z])])[^']]'"#,
),
(
TokenKind::Operator,
r"[(&(mut)?)(\+=?)(\-=?)(\*=?)(\/=?)(%=?)(!=?)(==?)(&&?=?)(\|\|?=?)(<<?=?)(>>?=?)(\.\.[\.=]?)\\\~\^:;,\@(=>?)]",
),
// Fields and methods: a.foo
(TokenKind::Property, r"\.[a-z_][A-Za-z0-9_]*"),
// Paths: std::foo::bar
(TokenKind::Property, r"[A-Za-z_][A-Za-z0-9_]*::"),
// Lifetimes
(TokenKind::Special, r"'[a-z_][A-Za-z0-9_]*\b"),
(TokenKind::Ident, r"\b[a-z_][A-Za-z0-9_]*\b"),
(TokenKind::Number, r"[0-9][A-Za-z0-9_\.]*"),
(TokenKind::Delimiter, r"[\{\}\(\)\[\]]"),
(TokenKind::Macro, r"[\{\}\(\)\[\]]"),
(TokenKind::Attribute, r"#!?\[[^\]]*\]"),
])
}
fn highlight_str(&self, mut s: &[char]) -> Vec<(Range<usize>, TokenKind)> {
let mut tokens = Vec::new(); let mut tokens = Vec::new();
let mut i = 0; let mut i = 0;
loop { loop {
@ -63,8 +144,8 @@ impl Highlighter {
{ {
tokens.push((i..i + n, self.entries[idx])); tokens.push((i..i + n, self.entries[idx]));
n n
} else if let Some((n, _)) = s.char_indices().nth(1) { } else if !s.is_empty() {
n 1
} else { } else {
break; break;
}; };
@ -74,7 +155,7 @@ impl Highlighter {
tokens tokens
} }
pub fn highlight(self, s: &str) -> Highlights { pub fn highlight(self, s: &[char]) -> Highlights {
let tokens = self.highlight_str(s); let tokens = self.highlight_str(s);
Highlights { Highlights {
highlighter: self, highlighter: self,
@ -107,32 +188,40 @@ impl Highlights {
pub enum Regex { pub enum Regex {
Whitespace, Whitespace,
WordBoundary, WordBoundary,
LineStart,
LineEnd,
LastDelim,
Range(char, char), Range(char, char),
Char(char), Char(char),
Set(Vec<Self>), Set(Vec<Self>),
NegSet(Vec<Self>),
Group(Vec<Self>), Group(Vec<Self>),
// (at_least, _) // (at_least, at_most, _)
Many(usize, Box<Self>), Many(usize, usize, Box<Self>),
// (delimiter, x) - delimit x with `delimiter` on either side (used for raw strings)
Delim(Box<Self>, Box<Self>),
} }
struct State<'a> { struct State<'a> {
s: &'a str, s: &'a [char],
pos: usize, pos: usize,
delim: Option<&'a [char]>,
} }
impl State<'_> { impl State<'_> {
fn peek(&self) -> Option<char> { fn peek(&self) -> Option<char> {
self.s[self.pos..].chars().next() self.s.get(self.pos).copied()
} }
fn prev(&self) -> Option<char> { fn prev(&self) -> Option<char> {
self.s[..self.pos].chars().rev().next() self.s[..self.pos].last().copied()
// self.s.get(self.pos.saturating_sub(1)).copied()
} }
fn skip(&mut self) { fn skip_if(&mut self, f: impl FnOnce(char) -> bool) -> Option<()> {
if let Some(c) = self.peek() { self.peek().filter(|c| f(*c))?;
self.pos += c.len_utf8(); self.pos += 1;
} Some(())
} }
fn attempt(&mut self, r: &Regex) -> Option<()> { fn attempt(&mut self, r: &Regex) -> Option<()> {
@ -152,26 +241,32 @@ impl State<'_> {
(is_word(self.prev().unwrap_or(' ')) != is_word(self.peek().unwrap_or(' '))) (is_word(self.prev().unwrap_or(' ')) != is_word(self.peek().unwrap_or(' ')))
.then_some(()) .then_some(())
} }
Regex::Char(c) => { Regex::LineStart => self.prev().map_or(true, |c| c == '\n').then_some(()),
if self.peek()? == *c { Regex::LineEnd => self.peek().map_or(true, |c| c == '\n').then_some(()),
self.skip(); Regex::LastDelim => {
if self.s[self.pos..].starts_with(self.delim?) {
self.pos += self.delim.unwrap().len();
Some(()) Some(())
} else { } else {
None None
} }
} }
Regex::Char(x) => self.skip_if(|c| c == *x),
Regex::Whitespace => { Regex::Whitespace => {
let mut once = false; let mut once = false;
while let Some(c) = self.peek() { while self.skip_if(|c| c.is_ascii_whitespace()).is_some() {
if c.is_ascii_whitespace() {
self.skip();
once = true; once = true;
} else {
break;
}
} }
once.then_some(()) once.then_some(())
} }
Regex::NegSet(xs) => {
if xs.iter().all(|x| self.attempt(x).is_none()) {
self.skip_if(|_| true)?;
Some(())
} else {
None
}
}
Regex::Set(xs) => xs.iter().find_map(|x| self.attempt(x)), Regex::Set(xs) => xs.iter().find_map(|x| self.attempt(x)),
Regex::Group(xs) => { Regex::Group(xs) => {
for x in xs { for x in xs {
@ -179,72 +274,101 @@ impl State<'_> {
} }
Some(()) Some(())
} }
Regex::Range(a, b) => { Regex::Range(a, b) => self.skip_if(|c| (a..=b).contains(&&c)),
if (a..=b).contains(&&self.peek()?) { Regex::Many(at_least, at_most, x) => {
self.skip();
Some(())
} else {
None
}
}
Regex::Many(at_least, x) => {
let mut times = 0; let mut times = 0;
loop { loop {
if self.attempt(x).is_none() { let pos = self.pos;
if times >= *at_most {
break;
} else if self.attempt(x).is_none() {
break; break;
} }
assert_ne!(pos, self.pos, "{x:?}");
times += 1; times += 1;
} }
if times >= *at_least { Some(()) } else { None } if times >= *at_least { Some(()) } else { None }
} }
r => todo!("{r:?}"), Regex::Delim(d, r) => {
let old_pos = self.pos;
self.go(d)?;
let old_delim = self.delim.replace(&self.s[old_pos..self.pos]);
let res = self.go(r);
self.delim = old_delim;
res
}
} }
} }
} }
impl Regex { impl Regex {
fn matches(&self, s: &str) -> Option<usize> { fn matches(&self, s: &[char]) -> Option<usize> {
let mut s = State { s, pos: 0 }; let mut s = State {
s,
pos: 0,
delim: None,
};
s.go(self).map(|_| s.pos) s.go(self).map(|_| s.pos)
} }
} }
use chumsky::{pratt::postfix, prelude::*}; use chumsky::{
pratt::{infix, left, postfix},
prelude::*,
};
impl Regex { impl Regex {
fn parser<'a>() -> impl Parser<'a, &'a str, Self, extra::Err<Rich<'a, char>>> { fn parser<'a>() -> impl Parser<'a, &'a str, Self, extra::Err<Rich<'a, char>>> {
recursive(|regex| { recursive(|regex| {
let char_ = any().filter(|c: &char| c.is_alphanumeric() || *c == '_'); let metachars = r"{}[]()^$.|*+-?\/@~";
let char_ = choice((
none_of(metachars),
// Escaped meta characters
just('\\').ignore_then(one_of(metachars)),
just("\\n").to('\n'),
));
let range = char_ let range = char_
.then_ignore(just('-')) .then_ignore(just('-'))
.then(char_) .then(char_)
.map(|(a, b)| Self::Range(a, b)); .map(|(a, b)| Self::Range(a, b));
let items = regex.clone().repeated().collect();
let atom = choice(( let atom = choice((
range, range,
char_.map(Self::Char), char_.map(Self::Char),
just("\\b").to(Self::WordBoundary), just("\\b").to(Self::WordBoundary),
just("^").to(Self::LineStart),
just("$").to(Self::LineEnd),
just("~").to(Self::LastDelim),
// Classes // Classes
just("[[:space:]]").map(|_| Self::Whitespace), just("[[:space:]]").map(|_| Self::Whitespace),
regex items
.clone()
.delimited_by(just("[^"), just(']'))
.map(Regex::NegSet),
items
.clone() .clone()
.repeated()
.collect()
.delimited_by(just('['), just(']')) .delimited_by(just('['), just(']'))
.map(Regex::Set), .map(Regex::Set),
regex items
.clone() .clone()
.repeated()
.collect()
.delimited_by(just('('), just(')')) .delimited_by(just('('), just(')'))
.map(Regex::Group), .map(Regex::Group),
)); ));
atom.pratt(( atom.pratt((
postfix(0, just('*'), |r, _, _| Self::Many(0, Box::new(r))), postfix(1, just('*'), |r, _, _| Self::Many(0, !0, Box::new(r))),
postfix(0, just('+'), |r, _, _| Self::Many(1, Box::new(r))), postfix(1, just('+'), |r, _, _| Self::Many(1, !0, Box::new(r))),
postfix(1, just('?'), |r, _, _| Self::Many(0, 1, Box::new(r))),
// Non-standard: `x@y` parses `x` and then `y`. `y` can use `~` to refer to the extra string that was
// parsed by `x`. This supports nesting and is intended for context-sensitive patterns like Rust raw
// strings.
infix(left(0), just('@'), |d, _, r, _| {
Self::Delim(Box::new(d), Box::new(r))
}),
)) ))
}) })
.repeated() .repeated()

View file

@ -49,6 +49,11 @@ impl ToString for Text {
} }
impl Text { impl Text {
// TODO: Remove this
pub fn chars(&self) -> &[char] {
&self.chars
}
pub fn to_coord(&self, pos: usize) -> [isize; 2] { pub fn to_coord(&self, pos: usize) -> [isize; 2] {
let mut n = 0; let mut n = 0;
let mut last_n = 0; let mut last_n = 0;
@ -138,8 +143,8 @@ impl Buffer {
Err(err) => return Err(err.into()), Err(err) => return Err(err.into()),
}; };
Ok(Self { Ok(Self {
highlights: Highlighter::from_file_name(&path).map(|h| h.highlight(&chars)),
text: Text { chars }, text: Text { chars },
highlights: Highlighter::from_file_name(&path).map(|h| h.highlight(&s)),
cursors: HopSlotMap::default(), cursors: HopSlotMap::default(),
dir, dir,
path: Some(path), path: Some(path),
@ -159,7 +164,7 @@ impl Buffer {
self.highlights = self self.highlights = self
.highlights .highlights
.take() .take()
.map(|hl| hl.highlighter.highlight(&self.text.to_string())); .map(|hl| hl.highlighter.highlight(self.text.chars()));
} }
pub fn clear(&mut self) { pub fn clear(&mut self) {

View file

@ -354,7 +354,8 @@ impl<'a> Terminal<'a> {
stdout.queue(style::Print(c)).unwrap(); stdout.queue(style::Print(c)).unwrap();
// Move cursor // Move cursor
cursor_pos[0] += 1; cursor_pos[0] +=
unicode_display_width::width(c.encode_utf8(&mut [0; 4])) as u16;
} }
} }
} }

View file

@ -51,6 +51,16 @@ pub struct Theme {
pub hl_token_keyword: Color, pub hl_token_keyword: Color,
pub hl_token_number: Color, pub hl_token_number: Color,
pub hl_token_type: Color, pub hl_token_type: Color,
pub hl_token_comment: Color,
pub hl_token_operator: Color,
pub hl_token_delimiter: Color,
pub hl_token_doc: Color,
pub hl_token_attribute: Color,
pub hl_token_property: Color,
pub hl_token_macro: Color,
pub hl_token_string: Color,
pub hl_token_special: Color,
pub hl_token_constant: Color,
} }
impl Default for Theme { impl Default for Theme {
@ -73,10 +83,20 @@ impl Default for Theme {
option_new: Color::AnsiValue(148), option_new: Color::AnsiValue(148),
hl_token_whitespace: Color::Reset, hl_token_whitespace: Color::Reset,
hl_token_ident: Color::AnsiValue(187), hl_token_ident: Color::AnsiValue(7),
hl_token_keyword: Color::AnsiValue(46), hl_token_keyword: Color::AnsiValue(112),
hl_token_number: Color::AnsiValue(45), hl_token_number: Color::AnsiValue(45),
hl_token_type: Color::AnsiValue(203), hl_token_type: Color::AnsiValue(210),
hl_token_comment: Color::AnsiValue(145),
hl_token_operator: Color::AnsiValue(111),
hl_token_delimiter: Color::AnsiValue(37),
hl_token_doc: Color::AnsiValue(180),
hl_token_attribute: Color::AnsiValue(146),
hl_token_property: Color::AnsiValue(152),
hl_token_macro: Color::AnsiValue(117),
hl_token_string: Color::AnsiValue(179),
hl_token_special: Color::AnsiValue(160),
hl_token_constant: Color::AnsiValue(81),
} }
} }
} }
@ -89,6 +109,16 @@ impl Theme {
TokenKind::Keyword => self.hl_token_keyword, TokenKind::Keyword => self.hl_token_keyword,
TokenKind::Number => self.hl_token_number, TokenKind::Number => self.hl_token_number,
TokenKind::Type => self.hl_token_type, TokenKind::Type => self.hl_token_type,
TokenKind::Comment => self.hl_token_comment,
TokenKind::Operator => self.hl_token_operator,
TokenKind::Delimiter => self.hl_token_delimiter,
TokenKind::Doc => self.hl_token_doc,
TokenKind::Attribute => self.hl_token_attribute,
TokenKind::Property => self.hl_token_property,
TokenKind::Macro => self.hl_token_macro,
TokenKind::String => self.hl_token_string,
TokenKind::Special => self.hl_token_special,
TokenKind::Constant => self.hl_token_constant,
} }
} }
} }