runjucks_core/
lexer.rs

1//! Tokenization: splits template source into [`Token`]s for [`crate::parser::parse`].
2//!
3//! Recognized regions:
4//! - `{#` … `#}` — comments (omitted from output).
5//! - `{%` / `{%-` … `%}` / `-%}` — statement tags as [`Token::Tag`] (inner body is whitespace-trimmed).
6//! - `{{` / `{{-` … `}}` / `-}}` — expressions as [`Token::Expression`] (inner spaces preserved unless trim markers strip them).
7//!
8//! **Whitespace control (Nunjucks-style):** `{%-` / `{{-` strip trailing whitespace from the preceding
9//! [`Token::Text`]; `-%}` / `-}}` strip leading whitespace from the following `Text`. Tag/variable
10//! bodies still trim inner whitespace when those markers are present (see variable handling below).
11//!
12//! Closing delimiters `%}` / `}}` are detected outside of double-quoted string literals (with `\`
13//! escapes), so delimiter-like sequences inside strings do not end the region early.
14//!
15//! `{% raw %}…{% endraw %}` and `{% verbatim %}…{% endverbatim %}` treat the middle as literal [`Token::Text`].
16
17use crate::errors::{Result, RunjucksError};
18
/// Nunjucks-style delimiter customization (the `tags` key in `configure`).
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Tags {
    /// Opening delimiter for statement tags (default `{%`).
    pub block_start: String,
    /// Closing delimiter for statement tags (default `%}`).
    pub block_end: String,
    /// Opening delimiter for expressions (default `{{`).
    pub variable_start: String,
    /// Closing delimiter for expressions (default `}}`).
    pub variable_end: String,
    /// Opening delimiter for comments (default `{#`).
    pub comment_start: String,
    /// Closing delimiter for comments (default `#}`).
    pub comment_end: String,
}
29
30impl Default for Tags {
31    fn default() -> Self {
32        Self {
33            block_start: "{%".into(),
34            block_end: "%}".into(),
35            variable_start: "{{".into(),
36            variable_end: "}}".into(),
37            comment_start: "{#".into(),
38            comment_end: "#}".into(),
39        }
40    }
41}
42
/// Options controlling whitespace behavior and delimiters during lexing.
///
/// Mirrors the Nunjucks `trimBlocks`, `lstripBlocks`, and `tags` configuration keys.
/// `Default` yields both flags `false` and the default delimiter set.
#[derive(Clone, Debug, Default)]
pub struct LexerOptions {
    /// When `true`, the first newline after a `{% … %}` tag is stripped.
    pub trim_blocks: bool,
    /// When `true`, leading whitespace and tabs on a line are stripped up to a `{% … %}` tag or `{# … #}` comment
    /// (only when the tag/comment is the first non-whitespace on that line).
    pub lstrip_blocks: bool,
    /// Custom delimiter strings. `None` uses the Nunjucks defaults (`{%`, `%}`, `{{`, `}}`, `{#`, `#}`).
    pub tags: Option<Tags>,
}
56
/// The kind of opening delimiter located by [`next_opener`].
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum OpenKind {
    /// `{#` — start of a comment.
    Comment,
    /// `{%` or `{%-`; `trim_open` is `true` for the `-` (whitespace-trimming) variant.
    Tag { trim_open: bool },
    /// `{{` or `{{-`; `trim_open` is `true` for the `-` (whitespace-trimming) variant.
    Var { trim_open: bool },
}
63
64fn next_opener(rest: &str, tags: &Tags) -> Option<(usize, OpenKind)> {
65    let bs = &tags.block_start;
66    let vs = &tags.variable_start;
67    let cs = &tags.comment_start;
68    let bs_trim = format!("{bs}-");
69
70    let vs_trim = format!("{vs}-");
71
72    let mut best: Option<(usize, OpenKind)> = None;
73    for (i, _) in rest.char_indices() {
74        let s = &rest[i..];
75        let candidate = if s.starts_with(cs.as_str()) {
76            Some((i, OpenKind::Comment))
77        } else if s.starts_with(bs_trim.as_str()) {
78            Some((i, OpenKind::Tag { trim_open: true }))
79        } else if s.starts_with(bs.as_str()) {
80            Some((i, OpenKind::Tag { trim_open: false }))
81        } else if s.starts_with(vs_trim.as_str()) {
82            Some((i, OpenKind::Var { trim_open: true }))
83        } else if s.starts_with(vs.as_str()) {
84            Some((i, OpenKind::Var { trim_open: false }))
85        } else {
86            None
87        };
88        if let Some((idx, kind)) = candidate {
89            best = match best {
90                None => Some((idx, kind)),
91                Some((bi, _)) if idx < bi => Some((idx, kind)),
92                Some(b) => Some(b),
93            };
94        }
95    }
96    best
97}
98
99fn parse_tag_prefix(rest: &str, tags: &Tags) -> Result<(String, usize, bool)> {
100    let bs = &tags.block_start;
101    let bs_trim = format!("{bs}-");
102    let open_len = if rest.starts_with(bs_trim.as_str()) {
103        bs_trim.len()
104    } else if rest.starts_with(bs.as_str()) {
105        bs.len()
106    } else {
107        return Err(RunjucksError::new(format!(
108            "internal lexer error: expected `{bs}`"
109        )));
110    };
111    let after_open = &rest[open_len..];
112    let (body_end, close_len) = find_tag_close(after_open, &tags.block_end)?;
113    let trim_close_marker = format!("-{}", tags.block_end);
114    let trim_close = after_open[body_end..].starts_with(trim_close_marker.as_str());
115    let body = after_open[..body_end].trim().to_string();
116    let total = open_len + body_end + close_len;
117    Ok((body, total, trim_close))
118}
119
/// Finds the byte index of the `{%` that starts the **matching** closing tag, balancing nested
/// `{% raw %}` / `{% endraw %}` (or `verbatim` / `endverbatim`) like Nunjucks `parseRaw`.
///
/// `rest` is the template suffix **after** the opening `{% raw %}` / `{% verbatim %}` tag was consumed.
/// Nesting level starts at 1 (inside the outer block).
fn find_matching_block_close(
    rest: &str,
    open_name: &str,
    end_name: &str,
    tags: &Tags,
) -> Result<usize> {
    let bs = &tags.block_start;
    let bs_trim = format!("{bs}-");
    // Trailing space distinguishes `raw …args` from an unrelated tag name like `rawr`.
    let open_prefix = format!("{open_name} ");
    let end_prefix = format!("{end_name} ");
    let mut pos = 0usize;
    let mut level = 1usize;
    while pos < rest.len() {
        let slice = &rest[pos..];
        // Not at a block-start delimiter: advance by one UTF-8 character and keep scanning.
        if !slice.starts_with(bs.as_str()) && !slice.starts_with(bs_trim.as_str()) {
            let adv = slice.chars().next().map(|c| c.len_utf8()).unwrap_or(1);
            pos += adv;
            continue;
        }
        let tag_start = pos;
        // Try to read a complete tag here; a malformed tag is treated as literal text
        // (step past one character and continue scanning).
        let (body, total, _) = match parse_tag_prefix(slice, tags) {
            Ok(t) => t,
            Err(_) => {
                pos += slice.chars().next().map(|c| c.len_utf8()).unwrap_or(1);
                continue;
            }
        };
        // A body that itself contains a block-start is not treated as a real tag either.
        if body.contains(bs.as_str()) {
            pos += slice.chars().next().map(|c| c.len_utf8()).unwrap_or(1);
            continue;
        }
        // Match the bare name (`raw`) or a name followed by arguments (`raw foo`).
        let is_open = body == open_name || body.starts_with(&open_prefix);
        let is_close = body == end_name || body.starts_with(&end_prefix);
        if is_open {
            level += 1;
        } else if is_close {
            level = level.saturating_sub(1);
            if level == 0 {
                // Return the offset of the tag's `{%`, not past it: the caller re-parses the
                // end tag itself to honor its whitespace-control markers.
                return Ok(tag_start);
            }
        }
        // A well-formed but non-matching tag is skipped in one jump.
        pos = tag_start + total;
    }
    Err(RunjucksError::new(format!(
        "unclosed {end_name} block: expected matching `{}` tag",
        tags.block_end
    )))
}
173
/// State for the scanners ([`find_var_close`], [`find_tag_close`]) that must skip over
/// double-quoted string literals while searching for a closing delimiter.
#[derive(Clone, Copy, PartialEq, Eq)]
enum StringScan {
    /// Outside any string literal.
    Code,
    /// Inside a `"…"` literal.
    String,
    /// Inside a literal, immediately after a `\` escape character.
    StringEscape,
}
180
/// Locates the closing `}}` / `-}}` of a variable expression, ignoring delimiter-like
/// sequences inside double-quoted strings (with `\` escapes).
///
/// `after_open` is the suffix just past the `{{` / `{{-` opener. Returns
/// `(body_end, close_len)` where `body_end` is the byte offset at which the close marker
/// begins (i.e. the body is `after_open[..body_end]`) and `close_len` is the marker's byte
/// length (longer for the `-}}` form). A nested variable-start in code position is an error.
fn find_var_close(after_open: &str, tags: &Tags) -> Result<(usize, usize)> {
    let ve = &tags.variable_end;
    let vs = &tags.variable_start;
    let trim_close = format!("-{ve}");
    let mut state = StringScan::Code;
    let mut i = 0usize;
    while i < after_open.len() {
        match state {
            StringScan::StringEscape => {
                // The character after `\` is consumed unconditionally (so `\"` stays in-string).
                let c = after_open[i..].chars().next().unwrap();
                state = StringScan::String;
                i += c.len_utf8();
            }
            StringScan::String => {
                let rest = &after_open[i..];
                let c = rest.chars().next().unwrap();
                if c == '\\' {
                    state = StringScan::StringEscape;
                } else if c == '"' {
                    state = StringScan::Code;
                }
                i += c.len_utf8();
            }
            StringScan::Code => {
                let rest = &after_open[i..];
                // `-}}` must be checked before `}}`: at the `-` we want the trim form to win.
                if rest.starts_with(trim_close.as_str()) {
                    return Ok((i, trim_close.len()));
                }
                if rest.starts_with(ve.as_str()) {
                    return Ok((i, ve.len()));
                }
                if rest.starts_with(vs.as_str()) {
                    return Err(RunjucksError::new(format!(
                        "nested `{vs}` inside a variable expression is not allowed"
                    )));
                }
                if rest.starts_with('"') {
                    state = StringScan::String;
                    i += 1;
                    continue;
                }
                let c = rest.chars().next().unwrap();
                i += c.len_utf8();
            }
        }
    }
    // Ran off the end of the input without leaving code/string state: unterminated expression.
    Err(RunjucksError::new(format!(
        "unclosed variable tag: expected `{ve}` or `-{ve}` after `{vs}`"
    )))
}
231
/// Locates the closing `%}` / `-%}` of a statement tag, ignoring delimiter-like sequences
/// inside double-quoted strings (with `\` escapes).
///
/// `after_open` is the suffix just past the `{%` / `{%-` opener. Returns
/// `(body_end, close_len)` where `body_end` is the byte offset at which the close marker
/// begins and `close_len` is the marker's byte length (longer for the `-%}` form).
/// Unlike [`find_var_close`], a nested block-start inside the body is not rejected here.
fn find_tag_close(after_open: &str, block_end: &str) -> Result<(usize, usize)> {
    let trim_close = format!("-{block_end}");
    let mut state = StringScan::Code;
    let mut i = 0usize;
    while i < after_open.len() {
        match state {
            StringScan::StringEscape => {
                // The character after `\` is consumed unconditionally (so `\"` stays in-string).
                let c = after_open[i..].chars().next().unwrap();
                state = StringScan::String;
                i += c.len_utf8();
            }
            StringScan::String => {
                let rest = &after_open[i..];
                let c = rest.chars().next().unwrap();
                if c == '\\' {
                    state = StringScan::StringEscape;
                } else if c == '"' {
                    state = StringScan::Code;
                }
                i += c.len_utf8();
            }
            StringScan::Code => {
                let rest = &after_open[i..];
                // `-%}` must be checked before `%}`: at the `-` we want the trim form to win.
                if rest.starts_with(trim_close.as_str()) {
                    return Ok((i, trim_close.len()));
                }
                if rest.starts_with(block_end) {
                    return Ok((i, block_end.len()));
                }
                if rest.starts_with('"') {
                    state = StringScan::String;
                    i += 1;
                    continue;
                }
                let c = rest.chars().next().unwrap();
                i += c.len_utf8();
            }
        }
    }
    // Ran off the end of the input without finding a close marker in code position.
    Err(RunjucksError::new(format!(
        "unclosed template tag: expected `{block_end}` or `-{block_end}` after block start"
    )))
}
275
/// Applies the `{{-` / `-}}` whitespace markers to a variable body: `trim_open` removes
/// leading whitespace, `trim_close` removes trailing whitespace, and the result is owned.
fn apply_var_trim(body: &str, trim_open: bool, trim_close: bool) -> String {
    let left_done = if trim_open { body.trim_start() } else { body };
    let both_done = if trim_close {
        left_done.trim_end()
    } else {
        left_done
    };
    both_done.to_owned()
}
286
/// One lexical unit from a template.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// Literal template text, after any whitespace-control trimming.
    Text(String),
    /// The inner source of a `{{ … }}` expression.
    Expression(String),
    /// The whitespace-trimmed inner source of a `{% … %}` statement tag.
    Tag(String),
}
294
/// Current lexing mode: normal scanning, or literal passthrough inside a
/// `{% raw %}` / `{% verbatim %}` block.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum LexerMode {
    /// Regular scanning for tags, variables, and comments.
    Normal,
    /// Inside `{% raw %}`: everything up to the matching `{% endraw %}` is literal text.
    Raw,
    /// Inside `{% verbatim %}`: everything up to the matching `{% endverbatim %}` is literal text.
    Verbatim,
}
301
/// Incremental lexer over a template string.
#[derive(Debug, Clone)]
pub struct Lexer<'a> {
    /// Full template source being lexed.
    input: &'a str,
    /// Byte offset of the next unconsumed character in `input`.
    position: usize,
    /// Normal scanning vs. literal passthrough inside `raw` / `verbatim`.
    mode: LexerMode,
    /// A buffered token, used when one step must yield two tokens
    /// (literal text before an `endraw`/`endverbatim` tag, then the tag itself).
    pending: Option<Token>,
    /// After `-%}` or `-}}`, strip leading whitespace from the next [`Token::Text`].
    strip_leading_next_text: bool,
    /// Whitespace-control options (`trim_blocks`, `lstrip_blocks`).
    opts: LexerOptions,
    /// Active delimiter set (from `opts.tags`, or the defaults).
    tags: Tags,
    /// After a `{% … %}` tag when `trim_blocks` is on, strip the first newline from the next text.
    trim_block_newline: bool,
}
316
317impl<'a> Lexer<'a> {
318    pub fn new(input: &'a str) -> Self {
319        Self::with_options(input, LexerOptions::default())
320    }
321
322    pub fn with_options(input: &'a str, opts: LexerOptions) -> Self {
323        let tags = opts.tags.clone().unwrap_or_default();
324        Self {
325            input,
326            position: 0,
327            mode: LexerMode::Normal,
328            pending: None,
329            strip_leading_next_text: false,
330            opts,
331            tags,
332            trim_block_newline: false,
333        }
334    }
335
336    #[inline]
337    pub fn rest(&self) -> &'a str {
338        self.input.get(self.position..).unwrap_or("")
339    }
340
341    #[inline]
342    pub fn is_eof(&self) -> bool {
343        self.position >= self.input.len()
344    }
345
346    fn skip_comment(&mut self) -> Result<()> {
347        let cs = &self.tags.comment_start;
348        let ce = &self.tags.comment_end;
349        let rest = self.rest();
350        if !rest.starts_with(cs.as_str()) {
351            return Err(RunjucksError::new(format!(
352                "internal lexer error: expected `{cs}`"
353            )));
354        }
355        let Some(end_rel) = rest.find(ce.as_str()) else {
356            return Err(RunjucksError::new(format!(
357                "unclosed comment: expected `{ce}` after `{cs}`"
358            )));
359        };
360        self.position += end_rel + ce.len();
361        Ok(())
362    }
363
364    fn consume_variable(&mut self, trim_open: bool) -> Result<Token> {
365        let vs = &self.tags.variable_start;
366        let vs_trim = format!("{vs}-");
367        let rest = self.rest();
368        let open_len = if rest.starts_with(vs_trim.as_str()) {
369            vs_trim.len()
370        } else {
371            vs.len()
372        };
373        self.position += open_len;
374        let after_open = self.rest();
375        let (body_end, close_len) = find_var_close(after_open, &self.tags)?;
376        let trim_close_marker = format!("-{}", self.tags.variable_end);
377        let trim_close = after_open[body_end..].starts_with(trim_close_marker.as_str());
378        let body = &after_open[..body_end];
379        let expr = apply_var_trim(body, trim_open, trim_close);
380        self.position += body_end + close_len;
381        if trim_close {
382            self.strip_leading_next_text = true;
383        }
384        Ok(Token::Expression(expr))
385    }
386
387    fn consume_tag_at_position(&mut self) -> Result<String> {
388        let rest = self.rest();
389        let (body, total, trim_close) = parse_tag_prefix(rest, &self.tags)?;
390        self.position += total;
391        if trim_close {
392            self.strip_leading_next_text = true;
393        } else if self.opts.trim_blocks {
394            self.trim_block_newline = true;
395        }
396        Ok(body)
397    }
398
399    fn end_tag_name(mode: LexerMode) -> &'static str {
400        match mode {
401            LexerMode::Raw => "endraw",
402            LexerMode::Verbatim => "endverbatim",
403            LexerMode::Normal => "",
404        }
405    }
406
407    fn open_tag_name(mode: LexerMode) -> &'static str {
408        match mode {
409            LexerMode::Raw => "raw",
410            LexerMode::Verbatim => "verbatim",
411            LexerMode::Normal => "",
412        }
413    }
414
415    fn next_token_block_mode(&mut self) -> Result<Option<Token>> {
416        let mode = self.mode;
417        let open_name = Self::open_tag_name(mode);
418        let end_name = Self::end_tag_name(mode);
419        let rest = self.rest();
420        let idx = find_matching_block_close(rest, open_name, end_name, &self.tags)?;
421        let mut literal = rest[..idx].to_string();
422        // Consume `trimBlocks` from the opening `{% raw %}` / `{% verbatim %}` tag so it does not
423        // apply to text that follows `{% endraw %}` / `{% endverbatim %}` (matches Nunjucks).
424        self.apply_leading_strip(&mut literal);
425        self.position += idx;
426        let rest2 = self.rest();
427        let (body, total, trim_close) = parse_tag_prefix(rest2, &self.tags)?;
428        self.position += total;
429        if trim_close {
430            self.strip_leading_next_text = true;
431        }
432        self.mode = LexerMode::Normal;
433        if !literal.is_empty() {
434            self.pending = Some(Token::Tag(body));
435            return Ok(Some(Token::Text(literal)));
436        }
437        Ok(Some(Token::Tag(body)))
438    }
439
440    /// Apply `trim_blocks` (strip leading `\n`) and `strip_leading_next_text` (`-%}` / `-}}`) to a text fragment.
441    fn apply_leading_strip(&mut self, text: &mut String) {
442        if self.strip_leading_next_text {
443            *text = text.trim_start().to_string();
444            self.strip_leading_next_text = false;
445            self.trim_block_newline = false;
446        } else if self.trim_block_newline {
447            if text.starts_with('\n') {
448                text.remove(0);
449            } else if text.starts_with("\r\n") {
450                text.drain(..2);
451            }
452            self.trim_block_newline = false;
453        }
454    }
455
456    /// When `lstrip_blocks` is enabled, strip trailing spaces/tabs that sit on the same line before a block tag or comment opener.
457    ///
458    /// Only strips when the opener is the first non-whitespace content on its line (i.e. only
459    /// horizontal whitespace appears between the preceding newline (or start of text) and the opener).
460    fn apply_lstrip_trailing(&self, text: &mut String, kind: OpenKind) {
461        if !self.opts.lstrip_blocks {
462            return;
463        }
464        let is_block = matches!(kind, OpenKind::Tag { .. } | OpenKind::Comment);
465        if !is_block {
466            return;
467        }
468        if let Some(nl) = text.rfind('\n') {
469            let after_nl = &text[nl + 1..];
470            if after_nl.chars().all(|c| c == ' ' || c == '\t') {
471                text.truncate(nl + 1);
472            }
473        } else if text.chars().all(|c| c == ' ' || c == '\t') {
474            text.clear();
475        }
476    }
477
478    fn next_token_normal(&mut self) -> Result<Option<Token>> {
479        loop {
480            if self.is_eof() {
481                return Ok(None);
482            }
483
484            let rest = self.rest();
485
486            match next_opener(rest, &self.tags) {
487                None => {
488                    let mut text = rest.to_owned();
489                    self.apply_leading_strip(&mut text);
490                    self.position = self.input.len();
491                    return Ok(Some(Token::Text(text)));
492                }
493                Some((0, OpenKind::Comment)) => {
494                    self.skip_comment()?;
495                    continue;
496                }
497                Some((0, OpenKind::Tag { .. })) => {
498                    let body = self.consume_tag_at_position()?;
499                    if body == "raw" || body.starts_with("raw ") {
500                        self.mode = LexerMode::Raw;
501                    } else if body == "verbatim" || body.starts_with("verbatim ") {
502                        self.mode = LexerMode::Verbatim;
503                    }
504                    return Ok(Some(Token::Tag(body)));
505                }
506                Some((0, OpenKind::Var { trim_open })) => {
507                    return self.consume_variable(trim_open).map(Some);
508                }
509                Some((idx, kind)) => {
510                    let mut text = rest[..idx].to_owned();
511                    self.apply_leading_strip(&mut text);
512                    let trim_open = matches!(
513                        kind,
514                        OpenKind::Tag { trim_open: true } | OpenKind::Var { trim_open: true }
515                    );
516                    if trim_open {
517                        text = text.trim_end().to_string();
518                    }
519                    self.apply_lstrip_trailing(&mut text, kind);
520                    self.position += idx;
521                    if text.is_empty() {
522                        continue;
523                    }
524                    return Ok(Some(Token::Text(text)));
525                }
526            }
527        }
528    }
529
530    pub fn next_token(&mut self) -> Result<Option<Token>> {
531        if let Some(t) = self.pending.take() {
532            return Ok(Some(t));
533        }
534        match self.mode {
535            LexerMode::Normal => self.next_token_normal(),
536            LexerMode::Raw | LexerMode::Verbatim => self.next_token_block_mode(),
537        }
538    }
539}
540
541/// Tokenizes the full `input` into a [`Vec`] of [`Token`]s.
542///
543/// An empty string yields a single [`Token::Text`] with empty content.
544///
545/// # Examples
546///
547/// ```
548/// use runjucks_core::lexer::{tokenize, Token};
549///
550/// let tokens = tokenize("Hi {{ name }}").unwrap();
551/// assert!(matches!(tokens[0], Token::Text(_)));
552/// assert!(matches!(tokens[1], Token::Expression(_)));
553/// ```
554pub fn tokenize(input: &str) -> Result<Vec<Token>> {
555    tokenize_with_options(input, LexerOptions::default())
556}
557
558/// Like [`tokenize`] but with explicit [`LexerOptions`].
559pub fn tokenize_with_options(input: &str, opts: LexerOptions) -> Result<Vec<Token>> {
560    if input.is_empty() {
561        return Ok(vec![Token::Text(String::new())]);
562    }
563    let mut lexer = Lexer::with_options(input, opts);
564    // Heuristic: fewer tokens for long plain-text runs; more when delimiter-heavy. Cap to avoid huge
565    // allocations on pathological inputs (full arena/interning is deferred until parse benches justify it).
566    let est = (input.len() / 24).saturating_add(4).min(8192);
567    let mut tokens = Vec::with_capacity(est);
568    while let Some(t) = lexer.next_token()? {
569        tokens.push(t);
570    }
571    Ok(tokens)
572}