Generalised highlight config to langpacks

This commit is contained in:
Joshua Barretto 2025-09-23 21:28:59 +01:00
parent 0a13d5c1f1
commit aee42780be
5 changed files with 265 additions and 220 deletions

View file

@ -34,8 +34,8 @@ pub enum TokenKind {
Constant,
}
/// Regex-driven syntax highlighter.
///
/// `matchers` and `entries` are parallel vectors: the pattern at index `i`
/// in `matchers` produces tokens of the kind at index `i` in `entries`
/// (both are pushed together by `with`/`with_many`).
#[derive(Default)]
pub struct Highlighter {
    // regex: meta::Regex,
    /// Compiled patterns, in registration order.
    matchers: Vec<Regex>,
    /// Token kind emitted when the matcher at the same index matches.
    entries: Vec<TokenKind>,
}
@ -55,187 +55,19 @@ impl Highlighter {
}
pub fn with(mut self, token: TokenKind, p: impl AsRef<str>) -> Self {
self.entries.push(token);
self.matchers
.push(Regex::parser().parse(p.as_ref()).unwrap());
self
self.with_many([(token, p)])
}
pub fn from_file_name(file_name: &Path) -> Option<Self> {
match file_name.extension()?.to_str()? {
"rs" => Some(Self::rust()),
"md" => Some(Self::markdown()),
"toml" => Some(Self::toml()),
"c" | "h" | "cpp" | "hpp" | "cxx" | "js" | "ts" | "go" => Some(Self::generic_clike()),
"glsl" | "vert" | "frag" => Some(Self::glsl()),
_ => None,
pub fn with_many<P: AsRef<str>>(
mut self,
patterns: impl IntoIterator<Item = (TokenKind, P)>,
) -> Self {
for (token, p) in patterns {
self.entries.push(token);
self.matchers
.push(Regex::parser().parse(p.as_ref()).unwrap());
}
}
pub fn markdown() -> Self {
Self::new_from_regex([
// Links
(TokenKind::String, r"\[[^\]]*\](\([^\)]*\))?"),
// Header
(TokenKind::Doc, r"^#+[[:space:]][^$]*$"),
// List item
(TokenKind::Operator, r"^[[:space:]]?[\-([0-9]+[\)\.])]"),
// Bold
(TokenKind::Property, r"\*\*[^(\*\*)]*\*\*"),
// Italics
(TokenKind::Attribute, r"\*[^\*]*\*"),
// Code block
(TokenKind::Operator, r"^```[^(^```)]*^```"),
// Inline code
(TokenKind::Constant, r"`[^`$]*[`$]"),
// HTML
(TokenKind::Special, r"<[^<>]*>"),
])
}
pub fn rust() -> Self {
Self::new_from_regex([
// Both kinds of comments match multiple lines
(
TokenKind::Doc,
r"\/\/[\/!][^\n]*$(\n[[:space:]]\/\/[\/!][^\n]*$)*",
),
(TokenKind::Comment, r"\/\/[^$]*$(\n[[:space:]]\/\/[^$]*$)*"),
// Multi-line comment
(TokenKind::Comment, r"\/\*[^(\*\/)]*\*\/"),
(
TokenKind::Keyword,
r"\b[(pub)(enum)(let)(self)(Self)(fn)(impl)(struct)(use)(if)(while)(for)(in)(loop)(mod)(match)(else)(break)(continue)(trait)(const)(static)(type)(mut)(as)(crate)(extern)(move)(ref)(return)(super)(unsafe)(use)(where)(async)(dyn)(try)(gen)(macro_rules)(union)(raw)]\b",
),
(TokenKind::Constant, r"\b[(true)(false)]\b"),
// Flow-control operators count as keywords
(TokenKind::Keyword, r"\.await\b"),
// Macro invocations: println!
(TokenKind::Macro, r"\b[A-Za-z_][A-Za-z0-9_]*!"),
// Meta-variables
(TokenKind::Macro, r"\$[A-Za-z_][A-Za-z0-9_]*\b"),
(TokenKind::Constant, r"\b[A-Z][A-Z0-9_]+\b"),
(TokenKind::Type, r"\b[A-Z][A-Za-z0-9_]*\b"),
// Primitives
(
TokenKind::Type,
r"\b[(u8)(u16)(u32)(u64)(u128)(i8)(i16)(i32)(i64)(i128)(usize)(isize)(bool)(str)(char)(f16)(f32)(f64)(f128)]\b",
),
// "foo" or b"foo" or r#"foo"#
(TokenKind::String, r#"b?r?(#*)@("[(\\")[^("~)]]*("~))"#),
// Characters
(
TokenKind::String,
r#"b?'[(\\[nrt\\0(x[0-7A-Za-z][0-7A-Za-z])])[^']]'"#,
),
(
TokenKind::Operator,
r"[(&(mut)?)(\?)(\+=?)(\-=?)(\*=?)(\/=?)(%=?)(!=?)(==?)(&&?=?)(\|\|?=?)(<<?=?)(>>?=?)(\.\.[\.=]?)\\\~\^:;,\@(=>?)]",
),
// Fields and methods: a.foo
(TokenKind::Property, r"\.[a-z_][A-Za-z0-9_]*"),
// Paths: std::foo::bar
(TokenKind::Property, r"[A-Za-z_][A-Za-z0-9_]*::"),
// Lifetimes
(TokenKind::Special, r"'[a-z_][A-Za-z0-9_]*\b"),
(TokenKind::Ident, r"\b[a-z_][A-Za-z0-9_]*\b"),
(TokenKind::Number, r"[0-9][A-Za-z0-9_\.]*"),
(TokenKind::Delimiter, r"[\{\}\(\)\[\]]"),
(TokenKind::Macro, r"[\{\}\(\)\[\]]"),
(TokenKind::Attribute, r"#!?\[[^\]]*\]"),
])
}
pub fn clike(keyword: &str, r#type: &str, builtin: &str) -> Self {
Self::new_from_regex([
// Both kinds of comments match multiple lines
(
TokenKind::Doc,
r"\/\/[\/!][^\n]*$(\n[[:space:]]\/\/[\/!][^\n]*$)*",
),
(TokenKind::Comment, r"\/\/[^$]*$(\n[[:space:]]\/\/[^$]*$)*"),
// Multi-line comment
(TokenKind::Comment, r"\/\*[^(\*\/)]*\*\/"),
(TokenKind::Keyword, keyword),
(TokenKind::Macro, builtin),
(TokenKind::Constant, r"\b[(true)(false)]\b"),
// Flow-control operators count as keywords
(TokenKind::Keyword, r"\b[(\.await)\?]\b"),
(TokenKind::Constant, r"\b[A-Z][A-Z0-9_]+\b"),
(TokenKind::Type, r"\b[A-Z][A-Za-z0-9_]*\b"),
// Primitives
(TokenKind::Type, r#type),
// "foo" or b"foo" or r#"foo"#
(TokenKind::String, r#"b?r?(#*)@("[(\\")[^("~)]]*("~))"#),
// Character strings
(
TokenKind::String,
r#"b?'[(\\[nrt\\0(x[0-7A-Za-z][0-7A-Za-z])])[^']]*'"#,
),
(
TokenKind::Operator,
r"[(&)(\?)(\+\+)(\-\-)(\+=?)(\-=?)(\*=?)(\/=?)(%=?)(!=?)(==?)(&&?=?)(\|\|?=?)(<<?=?)(>>?=?)(\.\.[\.=]?)\\\~\^:;,\@(=>?)]",
),
// Fields and methods: a.foo
(TokenKind::Property, r"\.[a-z_][A-Za-z0-9_]*"),
// Paths: std::foo::bar
(TokenKind::Property, r"[A-Za-z_][A-Za-z0-9_]*::"),
(TokenKind::Ident, r"\b[a-z_][A-Za-z0-9_]*\b"),
(TokenKind::Number, r"[0-9][A-Za-z0-9_\.]*"),
(TokenKind::Delimiter, r"[\{\}\(\)\[\]]"),
// Preprocessor
(TokenKind::Macro, r"^#[^$]*$"),
])
}
pub fn generic_clike() -> Self {
Self::clike(
// keyword
r"\b[(var)(enum)(let)(this)(fn)(struct)(class)(import)(if)(while)(for)(in)(loop)(else)(break)(continue)(const)(static)(type)(extern)(return)(async)(throw)(catch)(union)(auto)(namespace)(public)(private)(function)(func)]\b",
// types
r"\b[(([(unsigned)(signed)][[:space:]])*u?int[0-9]*(_t)?)(float)(double)(bool)(char)(size_t)(void)]\b",
"[]",
)
}
pub fn glsl() -> Self {
Self::clike(
// keyword
r"\b[(struct)(if)(while)(for)(else)(break)(continue)(const)(return)(layout)(uniform)(set)(binding)(location)(in)]\b",
// types
r"\b[(u?int)(float)(double)(bool)(void)([ui]?vec[1-4]*)([ui]?mat[1-4]*)(texture[(2D)(3D)]?(Cube)?)([ui]?sampler[(2D)(3D)]?(Shadow)?)]\b",
// Builtins
r"\b[(dot)(cross)(textureSize)(normalize)(texelFetch)(textureProj)(max)(min)(clamp)(reflect)(mix)(distance)(length)(abs)(pow)(sign)(sin)(cos)(tan)(fract)(mod)(round)(step)]\b",
)
}
pub fn toml() -> Self {
Self::new_from_regex([
// Header
(TokenKind::Doc, r#"^\[[^\n\]]*\]$"#),
// Delimiters
(TokenKind::Delimiter, r"[\{\}\(\)\[\]]"),
// Operators
(TokenKind::Operator, r"[=,]"),
// Numbers
(TokenKind::Number, r"[0-9][A-Za-z0-9_\.]*"),
// Double-quoted strings
(
TokenKind::String,
r#"b?"[(\\[nrt\\0(x[0-7A-Za-z][0-7A-Za-z])])[^"]]*""#,
),
// Single-quoted strings
(
TokenKind::String,
r#"b?'[(\\[nrt\\0(x[0-7A-Za-z][0-7A-Za-z])])[^']]*'"#,
),
// Booleans
(TokenKind::Constant, r"\b[(true)(false)]\b"),
// Identifier
(TokenKind::Ident, r"\b[a-z_][A-Za-z0-9_\-]*\b"),
// Comments
(TokenKind::Comment, r"#[^$]*$"),
])
self
}
fn highlight_str(&self, mut s: &[char]) -> Vec<Token> {
@ -264,17 +96,14 @@ impl Highlighter {
tokens
}
pub fn highlight(self, s: &[char]) -> Highlights {
pub fn highlight(&self, s: &[char]) -> Highlights {
let tokens = self.highlight_str(s);
Highlights {
highlighter: self,
tokens,
}
Highlights { tokens }
}
}
#[derive(Default)]
pub struct Highlights {
pub highlighter: Highlighter,
tokens: Vec<Token>,
}

221
src/lang/mod.rs Normal file
View file

@ -0,0 +1,221 @@
use super::*;
use crate::highlight::{Highlighter, TokenKind};
use std::path::Path;
/// Per-language configuration bundle: how to highlight a file and what
/// character sequence toggles a line comment in it.
#[derive(Default)]
pub struct LangPack {
    /// Syntax highlighter configured for the language (empty — no rules —
    /// by default).
    pub highlighter: Highlighter,
    /// Characters inserted/removed when toggling a line comment
    /// (e.g. `['/', '/', ' ']` for C-like languages); `None` for languages
    /// with no line-comment syntax (e.g. Markdown).
    pub comment_syntax: Option<Vec<char>>,
}
impl LangPack {
    /// Builds the language pack appropriate for `file_name`, chosen by its
    /// extension. Unknown (or missing/non-UTF-8) extensions yield a default
    /// pack: no highlighting rules and no comment syntax.
    pub fn from_file_name(file_name: &Path) -> Self {
        let ext = file_name.extension().and_then(|e| e.to_str()).unwrap_or("");
        // Resolve the highlighter and line-comment toggle sequence together,
        // then build the pack once at the end.
        let (highlighter, comment_syntax) = match ext {
            "rs" => (Highlighter::default().rust(), Some(vec!['/', '/', ' '])),
            "md" => (Highlighter::default().markdown(), None),
            "toml" => (Highlighter::default().toml(), Some(vec!['#', ' '])),
            "c" | "h" | "cpp" | "hpp" | "cxx" | "js" | "ts" | "go" => {
                (Highlighter::default().generic_clike(), Some(vec!['/', '/', ' ']))
            }
            "glsl" | "vert" | "frag" => {
                (Highlighter::default().glsl(), Some(vec!['/', '/', ' ']))
            }
            "py" => (Highlighter::default().python(), Some(vec!['#', ' '])),
            _ => (Highlighter::default(), None),
        };
        Self {
            highlighter,
            comment_syntax,
        }
    }
}
// NOTE(review): the patterns below are written in the project's own regex
// dialect (parsed by `Regex::parser()`), not standard regex syntax — e.g.
// `[(foo)(bar)]` appears to act as an alternation of the parenthesised
// groups. Confirm against the parser before editing any pattern.
// NOTE(review): rules are registered in chain order; registration order
// presumably determines match precedence — confirm in `highlight_str`.
impl Highlighter {
    /// Highlighting rules for Markdown documents.
    pub fn markdown(self) -> Self {
        self
            // Links
            .with(TokenKind::String, r"\[[^\]]*\](\([^\)]*\))?")
            // Header
            .with(TokenKind::Doc, r"^#+[[:space:]][^$]*$")
            // List item
            .with(TokenKind::Operator, r"^[[:space:]]?[\-([0-9]+[\)\.])]")
            // Bold
            .with(TokenKind::Property, r"\*\*[^(\*\*)]*\*\*")
            // Italics
            .with(TokenKind::Attribute, r"\*[^\*]*\*")
            // Code block
            .with(TokenKind::Operator, r"^```[^(^```)]*^```")
            // Inline code
            .with(TokenKind::Constant, r"`[^`$]*[`$]")
            // HTML
            .with(TokenKind::Special, r"<[^<>]*>")
    }

    /// Highlighting rules for Rust source.
    pub fn rust(self) -> Self {
        self
            // Both kinds of comments match multiple lines
            .with(TokenKind::Doc, r"\/\/[\/!][^\n]*$(\n[[:space:]]\/\/[\/!][^\n]*$)*")
            .with(TokenKind::Comment, r"\/\/[^$]*$(\n[[:space:]]\/\/[^$]*$)*")
            // Multi-line comment
            .with(TokenKind::Comment, r"\/\*[^(\*\/)]*\*\/")
            // Reserved words
            .with(
                TokenKind::Keyword,
                r"\b[(pub)(enum)(let)(self)(Self)(fn)(impl)(struct)(use)(if)(while)(for)(in)(loop)(mod)(match)(else)(break)(continue)(trait)(const)(static)(type)(mut)(as)(crate)(extern)(move)(ref)(return)(super)(unsafe)(use)(where)(async)(dyn)(try)(gen)(macro_rules)(union)(raw)]\b",
            )
            // Boolean literals
            .with(TokenKind::Constant, r"\b[(true)(false)]\b")
            // Flow-control operators count as keywords
            .with(TokenKind::Keyword, r"\.await\b")
            // Macro invocations: println!
            .with(TokenKind::Macro, r"\b[A-Za-z_][A-Za-z0-9_]*!")
            // Meta-variables
            .with(TokenKind::Macro, r"\$[A-Za-z_][A-Za-z0-9_]*\b")
            // SCREAMING_SNAKE_CASE identifiers are treated as constants
            .with(TokenKind::Constant, r"\b[A-Z][A-Z0-9_]+\b")
            // Remaining capitalised identifiers are treated as types
            .with(TokenKind::Type, r"\b[A-Z][A-Za-z0-9_]*\b")
            // Primitives
            .with(
                TokenKind::Type,
                r"\b[(u8)(u16)(u32)(u64)(u128)(i8)(i16)(i32)(i64)(i128)(usize)(isize)(bool)(str)(char)(f16)(f32)(f64)(f128)]\b",
            )
            // "foo" or b"foo" or r#"foo"#
            .with(TokenKind::String, r#"b?r?(#*)@("[(\\")[^("~)]]*("~))"#)
            // Characters
            .with(
                TokenKind::String,
                r#"b?'[(\\[nrt\\0(x[0-7A-Za-z][0-7A-Za-z])])[^']]'"#,
            )
            // Operators and punctuation
            .with(
                TokenKind::Operator,
                r"[(&(mut)?)(\?)(\+=?)(\-=?)(\*=?)(\/=?)(%=?)(!=?)(==?)(&&?=?)(\|\|?=?)(<<?=?)(>>?=?)(\.\.[\.=]?)\\\~\^:;,\@(=>?)]",
            )
            // Fields and methods: a.foo
            .with(TokenKind::Property, r"\.[a-z_][A-Za-z0-9_]*")
            // Paths: std::foo::bar
            .with(TokenKind::Property, r"[A-Za-z_][A-Za-z0-9_]*::")
            // Lifetimes
            .with(TokenKind::Special, r"'[a-z_][A-Za-z0-9_]*\b")
            // Plain identifiers
            .with(TokenKind::Ident, r"\b[a-z_][A-Za-z0-9_]*\b")
            // Numeric literals (incl. suffixed/float forms)
            .with(TokenKind::Number, r"[0-9][A-Za-z0-9_\.]*")
            // Brackets
            .with(TokenKind::Delimiter, r"[\{\}\(\)\[\]]")
            // NOTE(review): same pattern as the Delimiter rule above but tagged
            // Macro — possibly intentional, possibly a leftover; confirm.
            .with(TokenKind::Macro, r"[\{\}\(\)\[\]]")
            // Attributes: #[...] and #![...]
            .with(TokenKind::Attribute, r"#!?\[[^\]]*\]")
    }

    /// Comment rules shared by C-like languages (doc `///`-style, `//` line
    /// comments, and `/* ... */` block comments).
    fn clike_comments(self) -> Self {
        self
            // Both kinds of comments match multiple lines
            .with(
                TokenKind::Doc,
                r"\/\/[\/!][^\n]*$(\n[[:space:]]\/\/[\/!][^\n]*$)*",
            )
            // Regular comment
            .with(TokenKind::Comment, r"\/\/[^$]*$(\n[[:space:]]\/\/[^$]*$)*")
            // Multi-line comment
            .with(TokenKind::Comment, r"\/\*[^(\*\/)]*\*\/")
    }

    /// Preprocessor rule shared by C-like languages: a whole `#...` line.
    fn clike_preprocessor(self) -> Self {
        self.with(TokenKind::Macro, r"^#[^$]*$")
    }

    /// Generic rules shared by C-like languages: literals, operators,
    /// identifiers, and delimiters. Intended to be applied *after* the
    /// language-specific keyword/type rules (see `generic_clike`/`glsl`).
    pub fn clike(self) -> Self {
        self
            // Boolean literals
            .with(TokenKind::Constant, r"\b[(true)(false)]\b")
            // SCREAMING_SNAKE_CASE identifiers are treated as constants
            .with(TokenKind::Constant, r"\b[A-Z][A-Z0-9_]+\b")
            // Remaining capitalised identifiers are treated as types
            .with(TokenKind::Type, r"\b[A-Z][A-Za-z0-9_]*\b")
            // "foo" or b"foo" or r#"foo"#
            .with(TokenKind::String, r#"b?r?(#*)@("[(\\")[^("~)]]*("~))"#)
            // Character strings
            .with(TokenKind::String, r#"b?'[(\\[nrt\\0(x[0-7A-Za-z][0-7A-Za-z])])[^']]*'"#)
            // Operators and punctuation (incl. ++/-- which Rust lacks)
            .with(
                TokenKind::Operator,
                r"[(&)(\?)(\+\+)(\-\-)(\+=?)(\-=?)(\*=?)(\/=?)(%=?)(!=?)(==?)(&&?=?)(\|\|?=?)(<<?=?)(>>?=?)(\.\.[\.=]?)\\\~\^:;,\@(=>?)]",
            )
            // Fields and methods: a.foo
            .with(TokenKind::Property, r"\.[a-z_][A-Za-z0-9_]*")
            // Paths: std::foo::bar
            .with(TokenKind::Property, r"[A-Za-z_][A-Za-z0-9_]*::")
            // Plain identifiers
            .with(TokenKind::Ident, r"\b[a-z_][A-Za-z0-9_]*\b")
            // Numeric literals
            .with(TokenKind::Number, r"[0-9][A-Za-z0-9_\.]*")
            // Brackets
            .with(TokenKind::Delimiter, r"[\{\}\(\)\[\]]")
    }

    /// Catch-all rules for C-family languages (C/C++/JS/TS/Go): a broad
    /// keyword/type union on top of the shared C-like rules.
    pub fn generic_clike(self) -> Self {
        self
            // Keywords
            .with(TokenKind::Keyword, r"\b[(var)(enum)(let)(this)(fn)(struct)(class)(import)(if)(while)(for)(in)(loop)(else)(break)(continue)(const)(static)(type)(extern)(return)(async)(throw)(catch)(union)(auto)(namespace)(public)(private)(function)(func)]\b")
            // Primitives
            .with(TokenKind::Type, r"\b[(([(unsigned)(signed)][[:space:]])*u?int[0-9]*(_t)?)(float)(double)(bool)(char)(size_t)(void)]\b")
            .clike_comments()
            .clike_preprocessor()
            .clike()
    }

    /// Highlighting rules for GLSL shaders.
    pub fn glsl(self) -> Self {
        self
            // Keywords
            .with(TokenKind::Keyword, r"\b[(struct)(if)(while)(for)(else)(break)(continue)(const)(return)(layout)(uniform)(set)(binding)(location)(in)]\b")
            // Primitives (incl. vector/matrix/texture/sampler families)
            .with(TokenKind::Type, r"\b[(u?int)(float)(double)(bool)(void)([ui]?vec[1-4]*)([ui]?mat[1-4]*)(texture[(2D)(3D)]?(Cube)?)([ui]?sampler[(2D)(3D)]?(Shadow)?)]\b")
            // Builtins
            .with(TokenKind::Macro, r"\b[(dot)(cross)(textureSize)(normalize)(texelFetch)(textureProj)(max)(min)(clamp)(reflect)(mix)(distance)(length)(abs)(pow)(sign)(sin)(cos)(tan)(fract)(mod)(round)(step)]\b")
            .clike_comments()
            .clike_preprocessor()
            .clike()
    }

    /// Highlighting rules for Python source.
    pub fn python(self) -> Self {
        self
            // Keywords
            .with(TokenKind::Keyword, r"\b[(and)(as)(assert)(break)(class)(continue)(def)(del)(elif)(else)(except)(finally)(for)(from)(global)(if)(import)(in)(is)(lambda)(nonlocal)(not)(or)(pass)(raise)(return)(try)(while)(with)(yield)]\b")
            // Primitives
            // NOTE(review): `[]` is an empty alternation — this rule can
            // presumably never match. Placeholder or bug? Confirm against
            // the dialect parser.
            .with(TokenKind::Type, r"\b[]\b")
            // Builtins
            .with(TokenKind::Macro, r"\b[(True)(False)(None)]\b")
            // Doc comments
            .with(TokenKind::Doc, r"^##[^$]*$")
            // Comments
            .with(TokenKind::Comment, r"^#[^$]*$")
            .clike()
    }

    /// Highlighting rules for TOML configuration files.
    pub fn toml(self) -> Self {
        self
            // Header
            .with(TokenKind::Doc, r#"^\[[^\n\]]*\]$"#)
            // Delimiters
            .with(TokenKind::Delimiter, r"[\{\}\(\)\[\]]")
            // Operators
            .with(TokenKind::Operator, r"[=,]")
            // Numbers
            .with(TokenKind::Number, r"[0-9][A-Za-z0-9_\.]*")
            // Double-quoted strings
            .with(
                TokenKind::String,
                r#"b?"[(\\[nrt\\0(x[0-7A-Za-z][0-7A-Za-z])])[^"]]*""#,
            )
            // Single-quoted strings
            .with(
                TokenKind::String,
                r#"b?'[(\\[nrt\\0(x[0-7A-Za-z][0-7A-Za-z])])[^']]*'"#,
            )
            // Booleans
            .with(TokenKind::Constant, r"\b[(true)(false)]\b")
            // Identifier
            .with(TokenKind::Ident, r"\b[a-z_][A-Za-z0-9_\-]*\b")
            // Comments
            .with(TokenKind::Comment, r"#[^$]*$")
    }
}

View file

@ -1,5 +1,6 @@
mod action;
mod highlight;
mod lang;
mod state;
mod terminal;
mod theme;

View file

@ -1,8 +1,4 @@
use crate::{
Args, Dir, Error,
highlight::{Highlighter, Highlights},
theme,
};
use crate::{Args, Dir, Error, highlight::Highlights, lang::LangPack, theme};
use clipboard::{ClipboardContext, ClipboardProvider};
use slotmap::{HopSlotMap, new_key_type};
use std::{
@ -145,7 +141,8 @@ impl Text {
pub struct Buffer {
pub unsaved: bool,
pub text: Text,
pub highlights: Option<Highlights>,
pub lang: LangPack,
pub highlights: Highlights,
pub cursors: HopSlotMap<CursorId, Cursor>,
pub dir: Option<PathBuf>,
pub path: Option<PathBuf>,
@ -198,9 +195,11 @@ impl Buffer {
}
Err(err) => return Err(err.into()),
};
let lang = LangPack::from_file_name(&path);
Ok(Self {
unsaved,
highlights: Highlighter::from_file_name(&path).map(|h| h.highlight(&chars)),
highlights: lang.highlighter.highlight(&chars),
lang,
text: Text { chars },
cursors: HopSlotMap::default(),
dir,
@ -233,10 +232,7 @@ impl Buffer {
}
fn update_highlights(&mut self) {
self.highlights = self
.highlights
.take()
.map(|hl| hl.highlighter.highlight(self.text.chars()));
self.highlights = self.lang.highlighter.highlight(self.text.chars());
}
pub fn reset(&mut self) {
@ -266,24 +262,20 @@ impl Buffer {
let Some(cursor) = self.cursors.get_mut(cursor_id) else {
return;
};
if let Some(tok) = self
.highlights
.as_ref()
// Choose the longest token that the cursor is touching
.and_then(|hl| {
let a = hl.get_at(cursor.pos);
let b = hl.get_at(cursor.pos.saturating_sub(1));
a.zip(b)
.map(|(a, b)| {
if a.range.end - a.range.start > b.range.end - b.range.start {
a
} else {
b
}
})
.or(a)
.or(b)
let a = self.highlights.get_at(cursor.pos);
let b = self.highlights.get_at(cursor.pos.saturating_sub(1));
if let Some(tok) = a
.zip(b)
.map(|(a, b)| {
if a.range.end - a.range.start > b.range.end - b.range.start {
a
} else {
b
}
})
.or(a)
.or(b)
{
cursor.select(tok.range.clone());
} else {
@ -797,6 +789,10 @@ impl Buffer {
let Some(cursor) = self.cursors.get_mut(cursor_id) else {
return;
};
let Some(comment_syntax) = self.lang.comment_syntax.clone() else {
return;
};
let lines = cursor
.selection()
.map(|s| self.text.to_coord(s.start)[1]..=self.text.to_coord(s.end)[1])
@ -824,11 +820,11 @@ impl Buffer {
.text
.chars()
.get(pos..)
.map_or(false, |l| l.starts_with(&['/', '/', ' ']))
.map_or(false, |l| l.starts_with(&comment_syntax))
{
self.remove(pos..pos + 3);
self.remove(pos..pos + comment_syntax.len());
} else {
self.insert(pos, "// ".chars());
self.insert(pos, comment_syntax.iter().copied());
}
}
}

View file

@ -284,10 +284,8 @@ impl Input {
let (fg, c) = match line.get(coord as usize).copied() {
Some('\n') if selected => (state.theme.whitespace, '⮠'),
Some(c) => {
if let Some(fg) = buffer
.highlights
.as_ref()
.and_then(|hl| hl.get_at(pos?))
if let Some(fg) = pos
.and_then(|pos| buffer.highlights.get_at(pos))
.map(|tok| state.theme.token_color(tok.kind))
{
(fg, c)