-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.min.0497859f89d5ae4714dad856d467d3ab969f49aff10e71511e3a944bf3da006b3e221a97935e3e8e11464cf73ac864bb922d755bbc881f526c6705bc6c9a7490.js
507 lines (507 loc) · 135 KB
/
index.min.0497859f89d5ae4714dad856d467d3ab969f49aff10e71511e3a944bf3da006b3e221a97935e3e8e11464cf73ac864bb922d755bbc881f526c6705bc6c9a7490.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
var suggestions=document.getElementById("suggestions"),search=document.getElementById("search");search!==null&&document.addEventListener("keydown",inputFocus);function inputFocus(e){e.ctrlKey&&e.key==="/"&&(e.preventDefault(),search.focus()),e.key==="Escape"&&(search.blur(),suggestions.classList.add("d-none"))}document.addEventListener("click",function(e){var t=suggestions.contains(e.target);t||suggestions.classList.add("d-none")}),document.addEventListener("keydown",suggestionFocus);function suggestionFocus(e){const s=suggestions.classList.contains("d-none");if(s)return;const t=[...suggestions.querySelectorAll("a")];if(t.length===0)return;const n=t.indexOf(document.activeElement);if(e.key==="ArrowUp"){e.preventDefault();const s=n>0?n-1:0;t[s].focus()}else if(e.key==="ArrowDown"){e.preventDefault();const s=n+1<t.length?n+1:n;t[s].focus()}}(function(){var e=new FlexSearch.Document({tokenize:"forward",cache:100,document:{id:"id",store:["href","title","description"],index:["title","description","content"]}});e.add({id:0,href:"/docs/get-started/",title:"Get Started",description:"Find out what Pomsky is.",content:""}),e.add({id:1,href:"/docs/get-started/introduction/",title:"Introduction",description:"Summary of what Pomsky is and what it looks like",content:`What if regular expressions had been invented today, with software engineering best practices in mind? Enter Pomsky: A modern language that cross-compiles to regular expressions.
If you\u0026rsquo;re already familiar with regular expressions, check out the quick reference to get familiar with Pomsky.
The online playground is the ideal place to start experimenting with Pomsky.
To install the CLI, editor extension, or language integrations, follow the quick start guide.
`}),e.add({id:2,href:"/docs/get-started/quick-reference/",title:"Quick reference",description:"Quickly get familiar with the syntax",content:"Basics # Pomsky Explanation Regex \"string\" String string 'string' Raw string (no backslash escaping) string 'a' \u0026#x7c; 'b' a OR b a\u0026#x7c;b ('a' \u0026#x7c; 'b') Group (non-capturing) (?:a\u0026#x7c;b) # comment Line comment (?#comment) . any code point except \\n 1 . Repetitions # Pomsky Explanation Regex 'test'* 0 or more (?:test)* 'test'+ 1 or more (?:test)+ 'test'? 0 or 1 (?:test)? 'test'{4,} 4 or more times (?:test){4,} 'test'{4,7} 4 to 7 times (?:test){4,7} 'test'+ lazy Lazy (non-greedy) repetition (?:test)+? Variables #let x = 'foo' | 'bar'; x '-' x Variables are replaced with their content in the produced regex:\n(?:foo|bar)-(?:foo|bar) Character sets # Pomsky Explanation Regex [···] character set [···] ![···] negated character set [^···] [n t] special characters (line feed, tab) [\\n\\t] ['a' 'd'] an \u0026lsquo;a\u0026rsquo; or \u0026rsquo;d\u0026rsquo; [ad] ['ad'] an \u0026lsquo;a\u0026rsquo; or \u0026rsquo;d\u0026rsquo; [ad] ['a'-'d'] code points \u0026lsquo;a\u0026rsquo; through \u0026rsquo;d\u0026rsquo; [a-d] [U+45 U+FFEF] code points U+0045 and U+FFEF [\\x45\\uFFEF] Built-in character classes # Pomsky Explanation Regex [word], [w] any \u0026lsquo;word\u0026rsquo; code point (letter, digit, or _) \\w [Latin] any code point in the \u0026lsquo;Latin\u0026rsquo; script \\p{Latin} [Letter] any code point in the \u0026lsquo;Letter\u0026rsquo; category \\p{Letter} [ascii_digit] any ASCII digit [0-9] [!word] any code point except \u0026lsquo;word\u0026rsquo; code points \\W See all shorthand character classes and all supported Unicode properties.\nAnchors, boundaries, assertions #These don\u0026rsquo;t match one or more characters, but a position in the string.\nPomsky Explanation Regex ^ start of string 2 ^ $ end of string 2 $ % word boundary \\b !% not a word boundary \\B \u0026lt; start of a word \u0026gt; end of a word (\u003e\u003e 'x') lookahead assertion (?=x) (\u003c\u003c 'x') lookbehind assertion (?\u0026lt;=x) (!\u003e\u003e 'x') negated lookahead assertion (?!x) (!\u003c\u003c 'x') negated lookbehind assertion (?\u0026lt;!x) Capturing groups and references # Pomsky Explanation Regex :('foo') capturing group (foo) :bar('foo') capturing group named \u0026lsquo;bar\u0026rsquo; (?\u0026lt;bar\u003efoo) ::1 reference to 1st capturing group \\1 ::bar reference to group named \u0026lsquo;bar\u0026rsquo; \\k\u0026lt;bar\u003e ::-2 relative backreference \\k\u0026lt;-2\u003e ::+1 relative forward reference Wildcard patterns # Pomsky Explanation Regex . any code point except \\n 1 . Codepoint, C any code point [\\s\\S] Grapheme, G any grapheme \\X Testing #test { match 'foo'; match 'the', 'fox', 'the', 'dog' in 'the quick brown fox jumps over the lazy dog.'; reject 'lazy'; reject in 'lorem ipsum dolor'; } % [w]{3} % Comparing capturing groups:\ntest { match '13.1.4' as { major: '13', minor: '1', patch: '4' }; match '13.1.4' as { 1: '13', 2: '1', 3: '4' }; } :major([d]+) '.' :minor([d]+) '.' :patch([d]+) Modifiers # Pomsky Explanation Regex enable lazy; enable lazy repetition by default (?U) disable lazy; disable lazy repetition by default enable unicode; enable Unicode awareness (enabled by default) disable unicode; disable Unicode awareness Other # Pomsky Explanation Regex range '0'-'255' all decimal numbers from 0 to 255 range '0'-'1FF' base 16 all hex numbers from 0 to 1FF atomic('foo') atomic group (?\u003efoo) recursion recursively match the whole regex \\g\u003c0\u003e regex '[]acf-X]' inline regex []acf-X] Footnotes\nwith the single-line flag, . also matches line breaks.\u0026#160;\u0026#x21a9;\u0026#xfe0e;\u0026#160;\u0026#x21a9;\u0026#xfe0e;\nin multiline mode, these match the start or end of the line.\u0026#160;\u0026#x21a9;\u0026#xfe0e;\u0026#160;\u0026#x21a9;\u0026#xfe0e;\n"}),e.add({id:3,href:"/docs/get-started/quick-start/",title:"Quick Start",description:"Summary of how to start using Pomsky.",content:`Go to\u0026hellip;
Playground CLI JavaScript Rust
CLI #The CLI allows you to compile Pomsky expressions in the command line.
Pre-built binaries are available for Windows, Linux and macOS. Download them from the releases page.
Pomsky is also packaged for some package managers:
Install from source #This requires that a recent Rust toolchain is installed. Instructions for how to install Rust can be found here.
Install the CLI with
cargo install pomsky-bin Get help #To find out how to use the CLI, run
pomsky --help JavaScript plugin #Pomsky can be used with the npm module @pomsky-lang/unplugin. This is a compiler plugin, usable with Vite / Rollup / Webpack / ESBuild / ESM.
If you\u0026rsquo;re using Vite, add it to your config like this:
import { defineConfig } from 'vite' import pomsky from '@pomsky-lang/unplugin' export default defineConfig({ plugins: [pomsky()], }) Then you can import *.pom files from JavaScript/TypeScript:
import myRegex from './myRegex.pom' myRegex().test('does this string match?') or declare Pomsky expressions with the built-in pomsky$ macro:
const myRegex = pomsky$(\`% [word]{5} %\`) myRegex().test('does this string match?') Rust macro #If you want to write a Pomsky expression directly in your Rust source code, the pomsky-macro got you covered. Run this command:
cargo add pomsky-macro Then you can import and use it with
use pomsky_macro::pomsky; const MY_REGEX: \u0026amp;str = pomsky!([\u0026quot;great!\u0026quot;] | \u0026quot;great!\u0026quot;); Documentation can be found here.
`}),e.add({id:4,href:"/docs/get-started/enable-unicode/",title:"Enable Unicode Support",description:"Configure the RegExp engine to support Unicode.",content:`Pomsky has good Unicode support, but you might still have to enable Unicode support in your regex engine. This document explains how to do that for various regex engines.
If some information here is missing, outdated or needs clarification, I would greatly appreciate your help! You can edit this file on GitHub.
Rust #The Rust regex crate is Unicode-aware by default. There\u0026rsquo;s nothing you need to do.
JavaScript #In JavaScript, set the u flag, for example /[\\w\\s]/u. This makes it possible to use Unicode properties (\\p{...}) and code points outside of the BMP (\\u{...}).
Since \\w and \\d are not Unicode aware even when the u flag is enabled, Pomsky polyfills them. However, word boundaries aren\u0026rsquo;t Unicode aware, so you need to disable Unicode to use them or use lookarounds.
disable unicode; \u0026lt;'test'\u0026gt; If you need Unicode-aware word boundaries, you can use the following instead of the \u003c and \u003e word boundaries:
let wstart = (!\u0026lt;\u0026lt; [w]) (\u0026gt;\u0026gt; [w]); # start of a word let wend = (\u0026lt;\u0026lt; [w]) (!\u0026gt;\u0026gt; [w]); # end of a word PHP #PHP is Unicode-aware if the u flag is set, and this also applies to \\w, \\d, \\s and \\b. For example, '/\\w+/u' matches a word in any script.
Java, Kotlin, Scala #Add (?U) in front of the regex to make it Unicode-aware. For example, \u0026quot;(?U)\\\\w+\u0026quot; matches a word in any script.
Ruby #In Ruby, add (?u) in front of the regex to make it Unicode-aware. For example, /(?u)\\w+/ matches a word in any script.
Python #In the Python re module, \\w, \\d, \\s and \\b are Unicode-aware since Python 3.
If you\u0026rsquo;re still using Python 2, you can use the regex module from November 2021; releases newer than that don\u0026rsquo;t support Python 2.
Elixir #Regexes in Elixir are Unicode-aware if the u flag is added. For example, ~r/\\w+/u matches a word in any script.
Erlang #You need to set the unicode and ucp options to make regexes Unicode aware. For example, re:compile(\u0026quot;\\\\w+\u0026quot;, [unicode, ucp]) matches a word in any script.
PCRE #PCRE supports Unicode, but to make \\w, \\d, \\s and \\b Unicode-aware, you need to enable both PCRE_UTF8 and PCRE_UCP.
`}),e.add({id:5,href:"/docs/language-tour/",title:"Language Tour",description:"Learn how to use Pomsky.",content:""}),e.add({id:6,href:"/docs/language-tour/strings/",title:"Strings",description:"Introduction to matching text",content:`First, let\u0026rsquo;s get familiar with the basic building blocks of the language.
Pomsky expressions describe the syntactical structure of a text. There are several kinds of expressions. The most important kind is the string:
'test' This is an expression matching the text test. Note that strings are always wrapped in quotes. This is how we can distinguish strings from other kinds of expressions!
Multiple strings can be concatenated by writing them in succession:
'foo' 'bar' This matches the text foobar. Spaces between the strings are ignored, as are line breaks: Pomsky is whitespace-insensitive. Whitespace is what we call all invisible characters, such as spaces and line breaks. However, whitespace is only ignored outside of strings:
'spaces and this line break are not ignored!' Comments #So far, the expressions have been very simple, but this will change in the following chapters. When writing more complex expressions, it can be important to explain what something is doing, so a reader can understand it. This is what comments are for:
# this is a comment # comments are ignored by Pomsky! Comments start with a # and go until the end of the line. Comments are ignored by Pomsky, they\u0026rsquo;re meant only for you, the reader. You can add as much useful information in comments as you want!
String quotes #We can use double quotes ("") or single quotes ('') for strings. Most of the time it doesn\u0026rsquo;t matter which quotes you use, with a few exceptions:
Strings delimited with '' can\u0026rsquo;t contain single quotes. To match the text Spiders', use double quotes:
\u0026quot;Spiders'\u0026quot; Likewise, use '' whenever a string contains double quotes. But what if a string contains both? One possible solution is to concatenate multiple strings:
'The restaurant is called \u0026quot;Spiders' \u0026quot;'\u0026quot; '\u0026quot;.' Here are three strings, together matching the text The restaurant is called \u0026quot;Spiders'\u0026quot;.. If you don\u0026rsquo;t like this approach, there is another solution: Double quoted strings allow escaping with a backslash (\\):
\u0026quot;The restaurant is called \\\u0026quot;Spiders'\\\u0026quot;.\u0026quot; A backslash escapes the next character, robbing it of its special meaning. This means \\\u0026quot; is treated as a \u0026quot; character, and not as the closing quote of the string. However, in double quoted strings, backslashes must be escaped as well, so when matching a Windows file path like C:\\User\\John Doe\\Documents\\Thesis.pdf, better use single quotes.
`}),e.add({id:7,href:"/docs/language-tour/alternations/",title:"Alternations",description:"Match one of multiple alternatives",content:`What if we want to match multiple strings? Say, we want to match the texts one, two, three, four, and five:
'one' | 'two' | 'three' | 'four' | 'five' It\u0026rsquo;s that easy, just separate all alternatives with a vertical bar. This is called an alternation. The | can be read as \u0026ldquo;or\u0026rdquo;, since the above matches 'one' or 'two' or 'three' or 'four' or 'five'.
Grouping #If we want to concatenate an alternation, we need to wrap it in parentheses:
('blue' | 'yellow' | 'green') 'ish' This matches blueish, yellowish, and greenish. Every expression can be surrounded by parentheses, this is called a group. Here the parentheses are needed to clarify that the ish is concatenated with the entire alternation, not just the green part.
Formatting #When your expression gets so long that it doesn\u0026rsquo;t fit in a single line, it looks better to put every alternative in a separate line:
'one' | 'two' | 'three' | 'four' | 'five' But this looks odd, because the first line is not aligned with the others. So Pomsky allows you to add a leading vertical bar:
| 'one' | 'two' | 'three' | 'four' | 'five' `}),e.add({id:8,href:"/docs/language-tour/repetitions/",title:"Repetitions",description:"How to repeat an expression, greedily or lazily",content:`When we want to match an expression multiple times, it would be cumbersome to repeat our expression. Instead, we can specify how often the expression should occur:
('r' | 'w' | 'x' | '-'){9} This matches an r, w, x or - character 9 times. For example, it would match the string rwxr-xr--, or xxrr-xr-w.
What if we want to match strings of different lengths? Repetitions are quite flexible, so we can specify a lower and upper bound for the number of repetitions:
('r' | 'w' | 'x' | '-'){3,9} This matches between 3 and 9 characters. It\u0026rsquo;s also possible to omit the upper bound:
'la'{1,} This matches the text la at least 1 time. So it matches la, but also lala, lalala, lalalala, and so on.
Abbreviations #Wanting to match someting at least once is so common that there\u0026rsquo;s a special syntax for it: The +.
'la'+ And there are two more special cases: {0,} (repeated zero or more times) can be written as *, and {0,1} can be written as ?.
This leaves us with the following options:
Expression Meaning 'la'{5,9} Between 5 and 9 las 'la'{5,} At least 5 las 'la'* Any number of las (including 0) 'la'+ At least 1 la 'la'? Maybe a la, maybe not Matching behavior #👉 This section explains the behavior of typical regex engines. It is a bit more technical than the rest of this tour, but it is important to understand. Pomsky expressions are often used to search for substrings in a text matching a particular pattern. For this, a regex engine is used; the Pomsky expression is first transformed into a regex (short for \u0026ldquo;regular expression\u0026rdquo;) by the Pomsky compiler, and then given to a regex engine. The regex engine then performs the search by walking over the text, until it finds a substring matching the regex.
But when the expression is repeated, how often should the regex engine attempt to repeat it? For example, 'la'{2,4} could repeat 2, 3 or 4 times. When searching the text My favourite song goes like lalala la, should it stop as soon as it found lala, or should it check if there is a third and fourth la?
By default, regex engines are greedy: They try to repeat an expression as often as possible. Only if that fails will they check if the expression matches with fewer repetitions. So in the above example, the regex engine will give you the match lalala. Since it is followed by a space, it can\u0026rsquo;t match a fourth time.
It gets more interesting when a repetition is followed by another expression:
'la'{2,4} 'li' Let\u0026rsquo;s see what happens when searching the string lalalalalali for this pattern: The regex engine first detects the string la at the very start.
lalalalalali ^ It greedily attempts to repeat it 4 times, and succeeds. Now it is at the 9th character:
lalalalalali ^ Now it attempts to match the 'li' part, but fails: There is no li at the current position. So the regex engine gives up the last repetition and tries again:
lalalalalali ^ This is called backtracking; think of it like wandering through a maze and trying out every path. Whenever we reach a dead end, we return to the previous junction and try the next path.
Unfortunately, the 'li' part doesn\u0026rsquo;t match after the third repetition either, or the second one. Now the regex engine has no more paths to explore, and gives up.
But it isn\u0026rsquo;t finished yet, maybe there is a match somewhere else in the string! So it returns to the start:
lalalalalali ^ Since the regex engine already tried at this position and failed, it skips to the next occurrence of the substring la, which at the 3rd character:
lalalalalali ^ Again, it tries to greedily repeat it four times, and succeeds!
lalalalalali ^ The next step is matching the 'li' part, which succeeds, and the regex engine is done. The matched substring is:
lalalalalali ^^^^^^^^^^ Not all regex engines use backtracking; a notable exception is Rust\u0026rsquo;s regex library, which can convert an expression to a lazy deterministic finite automaton (lazy DFA), a special kind of state machine that never needs to backtrack.
However, \u0026ldquo;backtracking\u0026rdquo; is still a good mental model to understand what a regex engine does. Even though Rust\u0026rsquo;s regex library never backtracks, it always returns the same matches as a backtracking regex engine would. It might just do it faster.
`}),e.add({id:9,href:"/docs/language-tour/dots/",title:"Dots",description:"Matching an arbitrary codepoint",content:`You can use the dot (.) to match any character, except line breaks. For example:
... # 3 characters Most regex engines have a \u0026ldquo;singleline\u0026rdquo; option that changes the behavior of .. When enabled, . matches everything, even line breaks. You could use this to check if a text fits in an SMS:
.{1,160} # enforces the 160 character limit If you want to match any character, without having to enable the \u0026ldquo;singleline\u0026rdquo; option, Pomsky also offers the variable C, or Codepoint:
Codepoint{1,160} What\u0026rsquo;s a codepoint? #I lied when I said that the dot matches a character; it actually matches a Unicode codepoint.
A Unicode codepoint usually, but not always, represents a character. Exceptions are composite characters like ć (which may consist of a ´ and a c when it isn\u0026rsquo;t normalized). Composite characters are common in many scripts, including Japanese, Indian and Arabic scripts. Also, an emoji can consist of multiple codepoints, e.g. when it has a gender or skin tone modifier.
Repeating the dot #Be careful when repeating C or .. My personal recommendation is to never repeat them. Let\u0026rsquo;s see why:
'{' .* '}' This matches any content surrounded by curly braces. Why is this bad? Because .* will greedily consume anything, even curly braces, so looking for matches in the string {ab} de {fg} will return the whole string, but we probably expected to get the two matches {ab} and {fg}.
We\u0026rsquo;ll see how this can be fixed in a bit.
`}),e.add({id:10,href:"/docs/language-tour/character-sets/",title:"Character Sets",description:"Matching a codepoint with certain properties",content:`What if we want to match an arbitrary word? Enumerating every single word is obviously not feasible, so what to do instead? We can enumerate all letters and repeat them:
( | 'a' | 'b' | 'c' | 'd' | 'e' | 'f' | 'g' | 'h' | 'i' | 'j' | 'k' | 'l' | 'm' | 'n' | 'o' | 'p' | 'q' | 'r' | 's' | 't' | 'u' | 'v' | 'w' | 'x' | 'y' | 'z' )+ But this is very verbose and still only matches lowercase letters. We programmers tend to be lazy, so there must be a more convenient solution!
Character ranges #This expression matches words that can contain English lowercase and uppercase letters:
['a'-'z' 'A'-'Z']+ The square brackets indicate that this is a character set. A character set always matches exactly 1 character (more precisely, a Unicode codepoint). This character set contains two ranges, one for lowercase letters and one for uppercase letters. Together, this matches any character that is either an English lowercase or uppercase letter.
It\u0026rsquo;s also possible to add single characters to the set, for example:
['$' '_' 'a'-'z' 'A'-'Z'] Multiple characters can be put in the same quotes:
['$_' 'a'-'z' 'A'-'Z'] This is equivalent to ('$' | '_' | ['a'-'z' 'A'-'Z']), but it\u0026rsquo;s shorter.
Character ranges and Unicode #👉 You can skip this section if you are already familiar with Unicode. What is a range, exactly? Let\u0026rsquo;s see with an example:
['0'-'z'] This doesn\u0026rsquo;t seem to make sense, but it works. If you try it out, you\u0026rsquo;ll notice that it matches numbers, lowercase and uppercase letters. However, it also matches a few other characters, e.g. the question mark ?.
The reason is that Pomsky uses Unicode, a standard that assigns every character a numeric value. When we write '0'-'z', Pomsky assumes that we want to match any character whose numeric value is somewhere between the value of '0' and the value of 'z'. This works well for letters (e.g. 'a'-'z') and numbers ('0'-'9'), because these have consecutive values in Unicode. However, there are some special characters between digits, uppercase letters and lowercase letters:
Character Unicode value '0' 48 '1' 49 '2' 50 \u0026hellip; '9' 57 ':' 58 ';' 59 '\u0026lt;' 60 '=' 61 '\u0026gt;' 62 '?' 63 '@' 64 'A' 65 'B' 66 \u0026hellip; 'Z' 90 '[' 91 '\\' 92 ']' 93 '^' 94 '_' 95 '\`' 96 'a' 97 \u0026hellip; 'z' 122 Why, you might ask? This is for historical reasons.
Unicode properties #The reason why Unicode was invented is that most people in the world don\u0026rsquo;t speak English, and many of them use languages with different alphabets. To support them, Unicode includes 149,813 codepoints covering 161 different scripts. Since we have a widely supported standard for supporting different languages, let\u0026rsquo;s use it!
The character class ['a'-'z' 'A'-'Z'] only recognizes Latin characters. What should we do instead? We should use a general category. In this case, Letter seems like a good choice. Pomsky makes it easy to use Unicode categories:
[Letter] That\u0026rsquo;s it. This matches any letter from all 161 scripts! It\u0026rsquo;s also possible to match any codepoint in a certain script:
[Cyrillic Hebrew] This matches a Cyrillic or Hebrew codepoint.
Some regex engines can also match Unicode properties other than categories and scripts. Useful properties include
Alpha (letters and marks that can appear in a word) Upper, Lower (uppercase or lowercase letters) Emoji Math (mathematical symbols) You can see the full list of Unicode properties here.
Negation #Character classes are negated by putting a ! in front of it. For example, !['a'-'f'] matches anything except a letter between a and f.
It\u0026rsquo;s also possible to negate Unicode properties individually. For example, [Latin !Alpha] matches a codepoint that is either in the Latin script, or is not alphabetic.
Remember the example from the previous page? We repeated the dot to match matching curly braces:
'{' .* '}' But it didn\u0026rsquo;t work correctly because the dot is greedily repeated, so it can consume curly braces:
{foo} {bar} ^^^^^^^^^^ We can fix this by using a character class that doesn\u0026rsquo;t match curly braces:
'{' !['{}']* '}' `}),e.add({id:11,href:"/docs/language-tour/shorthands/",title:"Shorthands",description:"Character class shorthands",content:`There are abbreviations, called shorthands, for often needed character sets:
[digit] or [d] matches a decimal number. It is similar to ['0'-'9'], except that it is Unicode aware.
[word] or [w] matches a word character, i.e. a letter, digit or underscore. It\u0026rsquo;s similar to ['0'-'9' 'a'-'z' 'A'-'Z' '_'], except that it is Unicode aware. It matches all codepoints in the Alphabetic, Mark, Decimal_Number, Connector_Punctuation, and Join_Control Unicode categories.
[space] or [s] matches whitespace. It is equivalent to the White_Space category.
[horiz_space] or [h] matches horizontal whitespace, e.g. tabs und spaces.
[vert_space] or [v] matches vertical whitespace, e.g. line breaks.
These can be combined as well:
[d s '.'] # match digits, spaces, and dots Note that word, digit and space only match ASCII characters, if the regex engine isn\u0026rsquo;t configured to be Unicode-aware. How to enable Unicode support is described here.
What if I don\u0026rsquo;t need Unicode? #You don\u0026rsquo;t have to use Unicode-aware character sets such as [digit] if you know that the input is only ASCII. Unicode-aware matching can be considerably slower. For example, the [word] character class includes more than 100,000 code points, so matching a [ascii_word] (which includes only 63 code points) is faster.
Pomsky supports a number of ASCII-only shorthands:
Character class Equivalent [ascii] [U+00-U+7F] [ascii_alpha] ['a'-'z' 'A'-'Z'] [ascii_alnum] ['0'-'9' 'a'-'z' 'A'-'Z'] [ascii_blank] [' ' U+09], [ascii_cntrl] [U+00-U+1F U+7F] [ascii_digit] ['0'-'9'] [ascii_graph] ['!'-'~'] [ascii_lower] ['a'-'z'] [ascii_print] [' '-'~'] [ascii_punct] ['!'-'/' ':'-'@' '['-'\`' '{'-'~'] [ascii_space] [' ' U+09-U+0D] [ascii_upper] ['A'-'Z'] [ascii_word] ['0'-'9' 'a'-'z' 'A'-'Z' '_'] [ascii_xdigit] ['0'-'9' 'a'-'f' 'A'-'F'] Using them can improve performance, but be careful when you use them. If you aren\u0026rsquo;t sure if the input will ever contain non-ASCII characters, it\u0026rsquo;s better to err on the side of correctness, and use Unicode-aware character classes.
Non-printable characters #Characters that can\u0026rsquo;t be printed should be replaced with their hexadecimal Unicode code point. For example, you may write U+FEFF to match the Zero Width No-Break Space.
There are also 6 non-printable characters with a name:
[n] matches the \\n line feed. [r] matches the \\r carriage return. [f] matches the \\f form feed. [a] matches the \u0026ldquo;alert\u0026rdquo; or \u0026ldquo;bell\u0026rdquo; control character. [e] matches the \u0026ldquo;escape\u0026rdquo; control character. Other characters have to be written in their hexadecimal form:
[U+10-U+30 U+FEFF] Note that you don\u0026rsquo;t need to write leading zeroes, i.e. U+0 is just as ok as U+0000. However, it is conventional to write ASCII characters with two digits and non-ASCII characters with 4, 5 or 6 digits depending on their length.
`}),e.add({id:12,href:"/docs/language-tour/anchors/",title:"Anchors",description:"Matching the start/end of the string",content:`Anchors match the start or end of the text. ^ matches the start, and $ matches the end. Anchors are important because regex engines typically match substrings of the text, but sometimes you want the entire text to match.
For example: Let\u0026rsquo;s say a user entered a phone number, and you want to check if it is valid. You use the following expression:
'+'? [ascii_digit '-()/ ']+ But this also finds a match in texts that aren\u0026rsquo;t valid phone numbers:
agt4578409tuirüzojhüziou54x ^^^^^^^ ^^ match 1 match 2 To make sure the entire text has to match, we can add ^ and $ anchors:
^ '+'? [ascii_digit '-()/ ']+ $ The ^ ensures the match is at the start of the text, and the $ ensures that the end of the match is also the end of the search text.
Anchors can appear anywhere, to implement more complicated logic. For example:
('a' | ^) 'b' This matches either ab or just b. But if there is no a, the match must be at the start of the text.
`}),e.add({id:13,href:"/docs/language-tour/word-boundaries/",title:"Word Boundaries",description:"Matching the start/end of a word",content:`Word boundaries match a position where a word starts or ends. Like anchors, they do not consume any characters \u0026ndash; they have a length of 0. Expressions like this are called assertions.
There are three kinds of word boundaries:
\u003c to match at the start of a word \u003e to match at the end of a word % to match either at the start or at the end of a word. For example, if you want to find occurrences of the word test, but do not want to match substrings in words like testament or detests, you need to add word boundaries:
\u0026lt;'test'\u0026gt; To match multiple words, wrap an alternation in a group:
\u0026lt;('if' | 'else' | 'for' | 'while')\u0026gt; What is a word boundary? #A word start boundary is a position followed, but not preceded by a word character. Likewise, a word end boundary is position preceded, but not followed by a word character.
\u0026ldquo;Word characters\u0026rdquo; include letters, digits, and underscores. Formally, word characters are the set of the following Unicode properties:
Alphabetic Mark Decimal_Number Connector_Punctuation You can match a word character with the [word] character set.
Note that word boundaries aren\u0026rsquo;t 100% accurate: For example, the word can't has 4 word boundaries: At the start, the end, and around the apostrophe. Some scripts (e.g. Chinese) don\u0026rsquo;t separate words by spaces, so no word boundaries can be detected.
Negation #The % word boundary can be negated as !%. This matches inside or outside of a word, but not at a word boundary.
Note about JavaScript #In JavaScript, word boundaries are never Unicode-aware, even when the u flag is set. That\u0026rsquo;s why Unicode must be disabled to use them:
disable unicode; \u0026lt;'test'\u0026gt; If you need Unicode-aware word boundaries, you can use the following variables instead of the \u003c and \u003e word boundaries:
let wstart = (!\u0026lt;\u0026lt; [w]) (\u0026gt;\u0026gt; [w]); # start of a word let wend = (\u0026lt;\u0026lt; [w]) (!\u0026gt;\u0026gt; [w]); # end of a word wstart 'test' wend `}),e.add({id:14,href:"/docs/language-tour/modifiers/",title:"Modifiers",description:"Change how a sub-expression is treated",content:`Modifiers allow you to change the behavior of a Pomsky expression. Modifiers are statements; they can appear either at the top of the file, or inside a group:
disable unicode; [word]+ (enable unicode; '.' [word]+) Modifiers must appear before the expression they modify. They consist of two parts: The enable or disable keyword, and a mode, followed by a ;.
There are currently two modes that can be enabled or disabled:
Unicode mode #Unicode is enabled by default; disable it with disable unicode;.
When Unicode mode is disabled, shorthands like [word] no longer recognize Unicode, only ASCII. Unicode properties like [Letter] or [Emoji] are forbidden when Unicode is disabled.
Unicode mode also affects word boundaries: When disabled, only the ASCII characters a-z, A-Z, 0-9 and underscore _ are treated as word characters. This means that the word Königsstraße has word boundaries around the ö and ß, because they are not in the ASCII character set.
Lazy mode #The lazy mode is enabled with enable lazy;. It has the effect that repetition (which is usually greedy) becomes lazy: The regex engine will then try to repeat the expression as few times as possible.
For example, the expression 'la'+ will always match exactly one la in lazy mode, even when the search string is lalalala, because the regex engine stops searching as soon as it found the first la.
Lazy mode is a solution to the problem that occurs when the dot is repeated:
enable lazy; '{' .* '}' Without lazy mode, this greedily consumes as many characters as possible. So if the string {foo} bar {baz} should contain two matches, lazy mode is required. However, it is usually better to make the repetition more specific:
'{' !['{}']* '}' This is more performant because it avoids backtracking, and it is unambiguous.
Note that laziness and greediness can also be set individually for each repetition:
.* lazy # make only this repetition lazy .* greedy # make only this repetition greedy `}),e.add({id:15,href:"/docs/language-tour/capturing-groups/",title:"Capturing Groups",description:"Capture group contents for search \u0026 replace",content:"As we have seen before, parentheses can be used to group expressions together. Capturing groups are a special kind of group that capture their matched text. This allows extracting information from matches later.\nHere is an example where a Pomsky expression is used to match a semantic version number:\n# a semver version number, e.g. '1.3.17' :([digit]+) '.' :([digit]+) '.' :([digit]+) The : in front of the groups turns them into capturing groups. This means that when matching the string 1.3.17, the regex engine will create 3 captures containing the substrings 1, 3, and 17.\nHere is how we could use them in JavaScript:\nimport versionRegex from './versionRegex.pom' function createVersionTag(version) { return version.replace(versionRegex(), 'v$1_$2_$3') } The import statement imports and compiles the Pomsky expression, which is expected to be in another file, versionRegex.pom.\n⚠️ This requires a bundler with the @pomsky-lang/unplugin module. The replace function accepts two arguments: The compiled Pomsky expression, and a substitution string. This string contains some placeholders, $1, $2, and $3, which are substituted with the 1st, 2nd, and 3rd capturing group. So when the function is called with '1.3.17', it will return v1_3_17.\nNamed capturing groups #You can give capturing groups a name:\n:major([digit]+) '.' :minor([digit]+) '.' :patch([digit]+) This is good practice, as you no longer need to count capturing groups and can simply refer to them by their name:\nimport versionRegex from './versionRegex.pom' function createVersionTag(version) { const { major, minor, patch } = versionRegex().exec(version).groups return `v${major}_${minor}_${patch}` } "}),e.add({id:16,href:"/docs/language-tour/variables/",title:"Variables",description:"Refactoring expressions so you Don't Repeat Yourself",content:`Variables are a powerful feature that is exclusive to Pomsky; because no regex engine offers this functionality, variables in Pomsky are \u0026ldquo;inlined\u0026rdquo;, i.e. substituted with their value recursively.
Variables are declared with the let keyword:
let x = 'hello' | 'world'; The above will emit nothing, because the variable is declared, but not used. It could be used like this:
let x = 'hello' | 'world'; x '!' This compiles to
(?:hello|world)! Because variables are inlined, they do not allow recursion, otherwise the generated regular expression would have infinite size. But even without recursion, variables are a powerful and useful tool to create more complex expressions. Variables also serve as documentation: Their names tell the reader what the corresponding expression should match. Of course, this requires that you use descriptive variable names.
There can be multiple variable declarations. They can appear in any order, but the Pomsky expression using the variables must come last. For example, this is not allowed:
# doesn't work! x '!' let x = 'hello' | 'world'; Declarations can depend on each other, as long as there is no cyclic dependency:
let c = 'test'; let a = b b; let b = c '!'; a Here, a depends on b, which depends on c. But c cannot depend on a, as this would lead to a cyclic dependency.
Nesting #Variable declarations are statements, like modifiers. They can also be nested within a group; in that case, they are only usable within this group:
let name = 'Max'; ( let greeting = 'Hello'; greeting ', ' name ) greeting # error! Here, greeting can\u0026rsquo;t be used in the last line because it is only accessible within the group where it was declared.
Keep your code DRY #\u0026ldquo;DRY\u0026rdquo; stands for \u0026ldquo;Don\u0026rsquo;t Repeat Yourself\u0026rdquo;, and is an important principle in software engineering. It means that when you have the same code in multiple places, you should move it to its own function to avoid duplication.
Why is duplication bad? Because whenever you have to change it, you need to apply the same change in multiple places, and forgetting to update a place can easily cause bugs. With a lot of duplication, your code also becomes messy and unreadable.
Pomsky makes it easy to keep your expressions DRY: Whenever there is duplication, you can put it in a variable. For example, remember the example from the previous page:
:([digit]+) '.' :([digit]+) '.' :([digit]+) This isn\u0026rsquo;t particularly bad, but we can still improve it with a variable:
let number = [digit]+; :(number) '.' :(number) '.' :(number) Note that capturing groups cannot appear in a variable declaration. This is because capturing groups must be unique.
Negation #You can negate variables, but only if the expression they are replaced with can be negated. For example:
let hex = ['0'-'9' 'a'-'f' 'A'-'F']; let non_hex = !hex; `}),e.add({id:17,href:"/docs/language-tour/lookaround/",title:"Lookaround",description:"Matching forwards or backwards without consuming characters",content:`Lookarounds allow you to see if the characters before or after the current position match a certain expression. Lookarounds are assertions, meaning that they have a length of 0, much like anchors and word boundaries.
Example #Let\u0026rsquo;s say we want to match all keys in a JSON file. JSON is a simple, structured text format, e.g.
{ \u0026quot;languages\u0026quot;: [ { \u0026quot;name\u0026quot;: \u0026quot;Pomsky\u0026quot;, \u0026quot;proficiency\u0026quot;: \u0026quot;expert\u0026quot;, \u0026quot;open_source\u0026quot;: true } ] } To match all keys, we need to look for strings followed by a :. However, we don\u0026rsquo;t want the colon to be part of our match; we just want to check that it\u0026rsquo;s there! Here\u0026rsquo;s a possible solution:
'\u0026quot;' !['\u0026quot;']* '\u0026quot;' (\u0026gt;\u0026gt; [space]* ':') The \u003e\u003e introduces a lookahead assertion. It checks that the \u0026quot; is followed by a :, possibly with spaces in between. The contents of the lookahead are not included in the match.
But what if there\u0026rsquo;s a key containing escaped quotes, e.g. \u0026quot;foo \\\u0026quot;bar\\\u0026quot; baz\u0026quot;? To handle this, we need to allow escape sequences in the string:
'\u0026quot;' (!['\\\u0026quot;'] | '\\\\' | '\\\u0026quot;')* '\u0026quot;' (\u0026gt;\u0026gt; [space]* ':') There\u0026rsquo;s just one piece missing: The first quote should not be preceded by a backslash, so we need another assertion:
(!\u0026lt;\u0026lt; '\\') '\u0026quot;' (!['\\\u0026quot;'] | '\\\\' | '\\\u0026quot;')* '\u0026quot;' (\u0026gt;\u0026gt; [space]* ':') This is a negative lookbehind assertion. It asserts that the string is not preceded by the contained expression.
In total, there are 4 kinds of lookaround assertions:
\u003e\u003e (positive lookahead) \u003c\u003c (positive lookbehind) !\u003e\u003e (negative lookahead) !\u003c\u003c (negative lookbehind) Note that lookbehind isn\u0026rsquo;t supported everywhere. Rust supports neither lookbehind nor lookahead.
Intersection expressions #Lookaround makes it possible to simultaneously match a string in multiple ways. For example:
\u0026lt; (!\u0026gt;\u0026gt; ('_' | 'for' | 'while' | 'if') \u0026gt;) [word]+ \u0026gt; This matches a string consisting of word characters, but not one of the keywords _, for, while and if.
Be careful when using this technique, because the lookahead might not match the same length as the expression after it. Here, we ensured that both match until the word end with \u003e.
`}),e.add({id:18,href:"/docs/language-tour/ranges/",title:"Ranges",description:"Matching a number in a certain range",content:`When you need to match a range of numbers, the range syntax is your best friend. Character ranges (e.g. ['0'-'7']) are only able to match a single digit; the range syntax has no such limitation:
let octet = range '0'-'255'; # ipv4 address octet ('.' octet){3} This generates a regular expression that is both correct and as efficient as possible, since it never requires backtracking. If you\u0026rsquo;re curious, here\u0026rsquo;s the regex the range '0'-'255' compiles to:
0|1[0-9]{0,2}|2(?:[0-4][0-9]?|5[0-5]?|[6-9])?|[3-9][0-9]? Different bases #Pomsky can generate ranges in various bases. For example, to match hexadecimal numbers in a certain range, you might write:
range '10F'-'FFFF' base 16 Leading zeroes # If you wish to support leading zeros, this is easy to achieve by putting '0'* in front:
'0'* range '0'-'1024' If the number should have a certain length, with leading zeroes added when necessary, Pomsky has a special syntax for this:
range '0000'-'1024' This matches numbers in the specified range with exactly 4 digits, such as 0110 or 0026.
`}),e.add({id:19,href:"/docs/language-tour/references/",title:"References",description:"Matching the same thing more than once",content:`Sometimes it\u0026rsquo;s useful to match the same text as we matched before. For example, let\u0026rsquo;s try to match Rust\u0026rsquo;s raw strings, which look like r#\u0026quot;...\u0026quot;# or r###\u0026quot;...\u0026quot;###. They can have an arbitrary number of # characters, but must have the same number of #s at each end:
'r' :('#'*) '\u0026quot;' C* lazy '\u0026quot;' ::1 There are three important parts: First, there\u0026rsquo;s a capturing group matching any number of #s. We then match an arbitrary number of characters surrounded by quotes. Finally, there\u0026rsquo;s a ::1 reference. This matches the same text as was captured in capturing group 1. In other words, if the string started with ##, it also has to end with ##.
Another application is XML tags:
'\u0026lt;' :([word]+) '\u0026gt;' !['\u0026lt;']* '\u0026lt;/' ::1 '\u0026gt;' This is by no means a complete XML parser, but it recognizes an XML tag (without attributes) that doesn\u0026rsquo;t contain other XML tags. For example, it correctly matches \u0026lt;span\u0026gt;Hello world\u0026lt;/span\u0026gt;. With a backreference, it ensures that the closing tag is the same as the opening tag.
Pomsky has three kinds of references:
Numeric references, e.g. ::3, match a capturing group by its number. Named references, e.g. ::name, match a named capturing group by its name. Relative references, e.g. ::-1 or ::+2, match a capturing group relative to the current position. For example, ::-1 matches the previous capturing group, ::+1 matches the next one. Note that some regex engines only support backreferences, not forward references. And even when forward references are supported, the referenced group must have been already matched. I.e., this is not allowed:
# doesn't work! ::1 :('test') However, forward references can be used in repetitions to match what the referenced group captured in the previous repetition:
(::forward | :forward('test') '!')* This matches the text test!test, for example. In the first repetition, the second alternative matches test!, and the text test is captured by the forward capturing group. In the second iteration, the forward reference matches the text test.
`}),e.add({id:20,href:"/docs/language-tour/regex/",title:"Inline Regexes",description:"Insert text in the output regex with no escaping",content:`Although Pomsky on its own is very powerful, there might be situations where its syntax is not expressive enough. In these rare situations, Pomsky has an escape hatch: The regex keyword allows specifying an expression that is embedded in the output verbatim, without escaping:
regex 'hello|world?' This emits the following regular expression:
hello|world? ⚠️ This is dangerous and should only be used as a last resort. Pomsky does not parse the content of regex expressions, so it cannot ensure that the output is valid. For example, this is how you could use subroutines (which are not officially supported by Pomsky):
:octet(range '0'-'255') ('.' regex '\\g\u0026lt;octet\u0026gt;'){3} Note that Pomsky wraps a regex expression in a non-capturing group if it is followed by a repetition or surrounded by parentheses.
Finish line #This concludes the language tour! I hope
`}),e.add({id:21,href:"/docs/reference/",title:"Reference",description:"Detailed technical info",content:""}),e.add({id:22,href:"/docs/reference/grammar/",title:"Formal grammar",description:"Pomsky's syntax specification",content:`Summary #This document uses Pomsky syntax to describe Pomsky\u0026rsquo;s syntax. Here\u0026rsquo;s an incomplete summary, which is enough to read the grammar:
Variables are declared as let var_name = expression;. This means that var_name can be parsed by parsing expression.
Verbatim text is wrapped in double quotes ("") or single quotes ('').
A * after a rule indicates that it repeats 0 or more times.
A + after a rule indicates that it repeats 1 or more times.
A ? after a rule indicates that the rule is optional.
Rules can be grouped together by wrapping them in parentheses (()).
Alternative rules are each preceded by a vertical bar (|).
Formal grammar #Comments start with # and end at the end of the same line. Comments and whitespace are ignored; they can be added anywhere between tokens. Tokens are
identifiers (e.g. foo) keywords and reserved words (e.g. lazy) operators and punctuation (e.g. \u0026lt;\u0026lt; or ;) numbers (e.g. 30) string literals (e.g. \u0026quot;foo\u0026quot;) codepoints as documented here in detail.
Note about this grammar #Even though this grammar is written using Pomsky syntax, it isn\u0026rsquo;t actually accepted by the Pomsky compiler, because it uses cyclic variables.
Expression #let Expression = Statement* Alternation; See Alternation.
Statement #let Statement = | LetDeclaration | Modifier | Test; See LetDeclaration, Modifier, Test.
FixExpression #An expression which can have a prefix or suffix.
let FixExpression = | Lookaround | Negation | Repetition; See Lookaround, Negation, Repetition.
AtomExpression #let AtomExpression = | String | CodePoint | Group | CharacterSet | InlineRegex | Boundary | Reference | NumberRange | Variable | Dot | Recursion; See String, CodePoint, Group, CharacterSet, InlineRegex, Boundary, Reference, NumberRange, Dot, Recursion.
`}),e.add({id:23,href:"/docs/reference/tokens/",title:"Tokens",description:"The smallest parts of Pomsky's syntax",content:`Tokens (also called terminals) cannot be further divided. There are the following token types used in the grammar:
Name #Names (or identifiers) consist of a letter or underscore (_), followed by any number of letters, digits and underscores. For example:
# valid identifiers hello i18n _foo_ Gänsefüßchen # invalid identifiers kebab-case 42 👍 A letter is any code point with the Alphabetic property, which can be matched in most regex flavors with \\p{Alpha}. A digit is any code point from the Number general categories, which can be matched in most regex flavors with \\pN.
Note that group names have more restrictions than variable names: They must be ASCII-only and may not contain underscores.
Identifiers may not be one of the following reserved words:
U let lazy greedy range base atomic enable disable if else recursion regex test There are some contextual keywords that have a special meaning only in a certain context:
match reject as in unicode Contextual keywords can be used as variable and group names without issues.
Number #A whole number without a sign and without leading zeros. For example:
# valid numbers 0 1 42 10000 # invalid numbers 042 -30 +30 30.1 10_000 10,000 String #A string is a sequence of code points surrounded by single or double quotes. In double quoted strings, double quotes and backslashes are escaped by preceding them with a backslash. No other escapes are supported. Single quoted strings don\u0026rsquo;t support any escaping:
# valid strings 'test' \u0026quot;test\u0026quot; \u0026quot;C:\\\\User\\\\Dwayne \\\u0026quot;The Rock\\\u0026quot; Johnson\u0026quot; 'C:\\User\\Dwayne \u0026quot;The Rock\u0026quot; Johnson' 'this is a multiline string' \u0026quot;this is a multiline string\u0026quot; # invalid strings \u0026quot;\\n\u0026quot; \u0026quot;\\uFFFF\u0026quot; '\\'' Within string literals, \\r\\n (CRLF) sequences are replaced with a single \\n (LF). This is because text editors do not display the type of line ending, so users might save a Pomsky file with the wrong file ending by accident. In most regex engines, \\n matches a line break regardless of the platform convention used.
StringOneChar #Same as String, with the limitation that the string must contain exactly one code point. Example:
'a' 'ŧ' \u0026quot;\\\\\u0026quot; CodePoint #A codepoint consists of U, +, and 1 to 6 hexadecimal digits (0-9, a-f, A-F). It must represent a valid Unicode scalar value. This means that it must be a valid codepoint, but not a UTF-16 surrogate. For example:
# valid codepoints U+0 U+10 U+FFF U+10FFFF U + FF # invalid codepoints U+300000 U+00000001 U+D800 U+FGHI The code point token is \u0026lsquo;special\u0026rsquo; in that the + may be surrounded by spaces.
Punctuation #Punctuation tokens consist of visible ASCII characters. Most punctuation tokens are exactly one character, except for \u0026lt;\u0026lt;, \u0026gt;\u0026gt;, and ::. The full list of supported punctuation tokens is
\u0026gt;\u0026gt; \u0026lt;\u0026lt; :: ^ $ \u0026lt; \u0026gt; % * + ? | : ( ) { } , ! [ - ] . ; = Pomsky\u0026rsquo;s lexer can also lex a variety of illegal constructs, e.g. backslash escapes like \\g\u0026lt;0\u0026gt; and groups such as (:?), in order to show more useful error messages.
`}),e.add({id:24,href:"/docs/reference/constructs/",title:"Constructs",description:"Language constructs reference",content:""}),e.add({id:25,href:"/docs/reference/constructs/alternation/",title:"Alternation",description:"Reference – Matching one of several alternatives",content:`An alternation matches one of several alternatives.
Syntax #let Alternation = ('|'? Alternatives)?; let Alternatives = Alternative ('|' Alternative)*; let Alternative = FixExpression+; See FixExpression.
Note that an alternation may have a leading pipe. Also note that an alternative may not be empty, i.e. | | is not allowed. Use an empty string instead, e.g. 'foo' | '' | 'bar'.
Example #| 'hello' | 'pomsky'+ Support #Alternation is supported in all flavors.
Behavior #Alternatives are matched consecutively. The first alternative that matches is used.
Compilation #Compiled to an alternation. The example above would be compiled to hello|(?:pomsky)+.
Issues #Alternations aren\u0026rsquo;t yet properly optimized. Planned optimizations include:
Common prefixes: 'test' | 'testament' | 'testing' to test(?:|ament|ing) Single char alternation: 'a' | 'c' | '?' to [ac?] History # Support for leading pipes added in Pomsky 0.6 Initial implementation in Pomsky 0.1 `}),e.add({id:26,href:"/docs/reference/constructs/boundary/",title:"Boundaries",description:"Reference – Assert that a position is a word boundary or anchor",content:`Boundaries (word boundaries and anchors) are assertions that match if the current position has a certain property.
Syntax #let Boundary = | '^' | '$' | '%' | '\u0026lt;' | '\u0026gt;'; Example #^ $ # match empty string % 'foo' % # match 'foo' surrounded by word boundaries !% 'foo' !% # match 'foo' not surrounded by word boundaries \u0026lt; 'foo' \u0026gt; # match 'foo' as a whole word Support #Anchors (^ and $) are supported in all flavors. Word boundaries (%, \u0026lt;, and \u0026gt;) are not supported in JavaScript unless Unicode is disabled.
Support for boundaries is gated by the boundaries feature. Specify features with the --allowed-features option.
Behavior #All boundaries are assertions – they match between two characters. They do not contain any text, and repeating them has no effect.
Anchors #^ and $ are anchors. They match at the start and end of the string, respectively. Regex engines usually have a way to change their behavior to match at the start and end of the line instead.
They have the built-in Start and End variables as aliases.
Word boundaries #% is a word boundary, which matches either at the start or at the end of a word. \u0026lt; only matches at the start of a word, \u0026gt; only at the end. Surround a word with % % or with \u0026lt; \u0026gt; to make sure it doesn\u0026rsquo;t match a substring of a word, e.g. test in the word detest.
A word boundary is a position next to a \u0026ldquo;word character\u0026rdquo; (matching [word]), but only on one side. A word character is a character in one of the following Unicode general categories:
Alphabetic Mark Decimal_Number Connector_Punctuation Join_Control In the ASCII subset of Unicode, this would be the letters a-z and A-Z, the digits 0-9, and the underscore _.
The % word boundary is the only boundary that can be negated. !% matches a position that is not a word boundary, which means that it must be surrounded by either 0 or 2 word characters.
Relation to lookaround #Every boundary can be expressed in terms of lookaround assertions:
Boundary Equivalent lookarounds ^ !\u003c\u003c C $ !\u003e\u003e C % (\u003c\u003c[w]) (!\u003e\u003e[w]) \u0026#x7c; (!\u003c\u003c[w]) (\u003e\u003e[w]) !% (\u003c\u003c[w]) (\u003e\u003e[w]) \u0026#x7c; (!\u003c\u003c[w]) (!\u003e\u003e[w]) \u003c (!\u003c\u003c[w]) (\u003e\u003e[w]) \u003e (\u003c\u003c[w]) (!\u003e\u003e[w]) Compilation #Anchors are compiled verbatim to ^ and $. Word boundaries are compiled to \\b, or \\B when negated.
\u0026lt; and \u0026gt; are compiled to
[[:\u0026lt;:]] and [[:\u0026gt;:]] when targeting PCRE \\\u0026lt; and \\\u0026gt; when targeting Rust (?\u0026lt;!\\w)(?=\\w) and (?\u0026lt;=\\w)(?!\\w) when targeting any other flavor Issues #In JavaScript, word boundaries are never Unicode aware, so they are only allowed when Unicode is explicitly disabled.
In other flavors, word boundaries are always Unicode aware, even when Unicode has been disabled.
History # Added \u0026lt; and \u0026gt; in Pomsky 0.11 Forbidden % in JavaScript unless Unicode is disabled in Pomsky 0.10 Removed deprecated \u0026lt;% and %\u0026gt; syntax in Pomsky 0.7 Added ^ and $ in Pomsky 0.6 Added Start and End variables in Pomsky 0.4.2 Initial implementation in Pomsky 0.1 Using old syntax \u0026lt;% and %\u0026gt; instead of ^ and $ `}),e.add({id:27,href:"/docs/reference/constructs/charset/",title:"Character Set",description:"Reference – Matching one of several code points",content:`A character set allows matching one of several code points.
Syntax #let CharacterSet = '[' CharacterSetInner+ ']'; let CharacterSetInner = | Range | String | CodePoint | NonPrintable | Shorthand | UnicodeProperty | AsciiShorthand; let Range = SingleChar '-' SingleChar; let SingleChar = | StringOneChar | CodePoint | NonPrintable; # deprecated! let NonPrintable = | 'n' | 'r' | 't' | 'a' | 'e' | 'f'; let Shorthand = '!'? ShorthandIdent; let ShorthandIdent = | 'w' | 'word' | 'd' | 'digit' | 's' | 'space' | 'h' | 'horiz_space' | 'v' | 'vert_space' let AsciiShorthand = | 'ascii' | 'ascii_alpha' | 'ascii_alnum' | 'ascii_blank' | 'ascii_cntrl' | 'ascii_digit' | 'ascii_graph' | 'ascii_lower' | 'ascii_print' | 'ascii_punct' | 'ascii_space' | 'ascii_upper' | 'ascii_word' | 'ascii_xdigit'; let UnicodeProperty = '!'? Name; Example #['ad' 'f'-'x' Greek digit n U+FEFF] Support #Character sets are supported in all flavors. However, not all Unicode properties are supported in all flavors.
Furthermore, in .NET, character sets incorrectly match UTF-16 code units rather than code points. This means that a character set can not be used for characters outside the Basic Multilingual Plane (BMP) in .NET.
In JavaScript, word cannot be negated if the character set contains other items as well. For example, [!word s] does not work. The reason is that \\w is polyfilled in JavaScript to be Unicode aware.
Behavior #A character set matches a single Unicode code point. It is surrounded by [] square brackets and can contain an arbitrary number of characters, code points, character ranges, non-printable characters, shorthand character classes, and Unicode properties.
A character set is a set in the mathematical sense, matching the union of everything written in the square brackets.
Code Points and Characters #Character sets can contain code points such as U+20 or U+FEFF, and strings, which are treated as the set of their code points. For example, ['ace'] is equivalent to ['a' 'c' 'e'], or [U+61 U+63 U+65].
In .NET, only code points in the BMP are allowed.
Character ranges #Ranges of code points can be specified like ['a'-'z'] or [U+40-U+50]. Ranges must be ascending and non-empty: The first code point must be lower than the second code point, so they constitute a lower and upper bound. Both bounds are included in the set. Each bound can be either a string containing exactly one code point, a code point literal, or a non-printable character (see below). Non-printable characters in ranges are deprecated.
Non-printable characters #There are 6 non-printable ASCII characters with a special syntax:
a is equivalent to U+07 (bell) t is equivalent to U+09 (horizontal tabulation) n is equivalent to U+0A (new line) e is equivalent to U+1B (vertical tabulation) f is equivalent to U+0C (form feed) r is equivalent to U+0D (carriage return) Shorthands #There exist a variety of shorthands that can be used in a character set.
The following general shorthands exist, each of which has a full name and a single-character alias:
Full name Alias Equivalent word w Alphabetic Mark Decimal_Number Connector_Punctuation Join_Control digit d Decimal_Number space s White_Space horiz_space h U+09 Space_Separator vert_space v U+0A-U+0D U+85 U+2028 U+2029 The following ascii shorthands exist:
Name Equivalent ascii U+00-U+7F ascii_alpha 'a'-'z' 'A'-'Z' ascii_alnum 'a'-'z' 'A'-'Z' '0'-'9' ascii_blank ' ' t ascii_cntrl U+00-U+1F U+7F ascii_digit '0'-'9' ascii_graph U+21-U+7E ascii_lower 'a'-'z' ascii_print U+20-U+7E ascii_punct U+21-U+2F U+3A-U+40 U+5B-U+60 U+7B-U+7E ascii_space ' ' t n r e f ascii_upper 'A'-'Z' ascii_word 'a'-'z' 'A'-'Z' '0'-'9' '_' ascii_xdigit '0'-'9' 'a'-'f' 'A'-'F' Pomsky supports all Unicode general properties, scripts, blocks, and other boolean properties. However, not all Unicode properties are supported in every flavor. For example, Python does not support Unicode properties at all, JavaScript does not support blocks, and Java does not support most boolean properties.
Details about supported Unicode properties can be found here.
Negation of shorthands #Shorthands (except for ASCII shorthands) are special in that they can be negated. However, only a single exclamation mark is allowed in front of shorthands, so no double negation is possible.
There are some exceptions though: v and h can\u0026rsquo;t be negated. w can\u0026rsquo;t be negated when targetting JavaScript. This restriction could be lifted once the /v flag becomes widely supported.
Compilation #Usually, compiling character sets is straightforward, but there are some edge cases. Character sets translate to brackets ([···]), usually called \u0026ldquo;character classes\u0026rdquo; in regex lingo. Negated character sets translate to negative character classes ([^···]). Negating a single-character string also produces a character class, whereas a non-negated character class with only a single element is unwrapped:
['ad'] # [ad] !['ad'] # [^ad] !'a' # [^a] ['a'] # a Pomsky removes duplicate items and eliminates double negation where possible:
['test'] # [tes] ![!word] # \\w Special characters are escaped when needed, but ^ is only escaped if it is the first character:
['[]-^\u0026amp;\\'] # [\\[\\]\\-^\\\u0026amp;\\\\] ['^'] # [\\^] \u0026amp; and | are not escaped when targeting JavaScript.
Also, word/w and digit/d are polyfilled in JavaScript using equivalent Unicode general categories. vert_space/v and horiz_space/h are polyfilled in all flavors except PCRE and Java. ASCII shorthands are polyfilled everywhere, even though they are supported in PCRE as \u0026ldquo;POSIX classes\u0026rdquo;.
Issues #Behavior is incorrect in .NET (see above).
Union and intersection of sets is not yet implemented.
The expression ['\u0026amp;' '\u0026amp;'-'Z'] miscompiles in JS with the /v flag because \u0026amp; is not escaped.
History # Deprecated shorthands in character ranges in Pomsky 0.11 Extended set of supported Unicode properties in Pomsky 0.10 Added support for Unicode blocks and boolean properties in Pomsky 0.8 Deprecated [.], [codepoint] and [cp] syntax in Pomsky 0.6 Added shorthand aliases word, digit, space, horiz_space, vert_space in Pomsky 0.3 ASCII shorthands renamed to begin with ascii_ in Pomsky 0.3 Initial implementation in Pomsky 0.1 `}),e.add({id:28,href:"/docs/reference/constructs/dot/",title:"Dot",description:"Reference – Matching an arbitrary code point except a line break",content:`The dot matches an arbitrary code point except a line break. In multiline mode, the dot also matches line breaks.
Syntax #let Dot = '.'; Example #.{4,12} Support #The dot is supported in all flavors.
In .NET, the dot matches a UTF-16 code unit rather than a full code point, so a character outside the Basic Multilingual Plane matches two dots (..).
Support for the dot is gated by the dot feature. Specify features with the --allowed-features option.
Behavior #The dot matches a single code point (except in .NET, see above), but not a line break.
Regex engines disagree on what constitutes a line break character. This is explained here in detail. To get the same behavior everywhere, use ![n] or ![v] instead.
When multiline mode is enabled in the regex engine, the dot matches every code point, including line breaks.
Compilation #The dot is compiled to ..
Issues #Regex engines disagree on what constitutes a line break character, so the dot is not really portable.
History #Added in Pomsky 0.8
`}),e.add({id:29,href:"/docs/reference/constructs/group/",title:"Groups",description:"Reference – Capturing and grouping multiple expressions",content:"Multiple expressions can be grouped together by wrapping them in parentheses. Capturing groups can be used to extract information from a match.\nSyntax #let Group = GroupKind? '(' Expression ')'; let GroupKind = | ':' Name? | 'atomic'; See Expression.\nA group name must be ASCII-only and may not contain underscores. Furthermore, a group name must be no longer than 32 characters. For example:\n:underscores_are_invalid() :äöéŧûøIsInvalid() :thisGroupNameIsTooLongUnfortunately() :thisIsAllowed() These restrictions exist because of Java. To make Pomsky behave consistently across regex flavors, we have to use the most restrictive rules for all flavors.\nExample #('a' | 'bc')* 'd' :('foo') :bar('bar') atomic('atomic') Support #Normal and capturing groups are supported in all flavors. Atomic groups are only supported in the Java, PCRE, Ruby, and .NET flavors.\nSupport for capturing groups is gated by the numbered-groups and named-groups features. Support for atomic groups is gated by the atomic-groups feature. Specify features with the --allowed-features option.\nBehavior #Normal groups #Normal groups (those that are neither capturing nor atomic) have no effect; their only purpose is to group multiple expressions together. These are all equivalent:\n'test' ('test') ((('test'))) Groups are sometimes required to disambiguate which parts of the expression \u0026ldquo;belong together\u0026rdquo;:\n('a' .){3} # equivalent to a.a.a. 'a' .{3} # equivalent to a... 'a' ('b' | 'c') # equivalent to ab|ac 'a' 'b' | 'c' # equivalent to ab|c Capturing groups #Capturing groups are used to \u0026ldquo;capture\u0026rdquo; parts of a match, which can then be used, e.g. for text substitution or further processing. For example, JavaScript allows you to reference captures in the substitution string when using String.replace:\n'1.13.5'.replace(/(\\d+)\\.(\\d+)\\.(\\d+)/, 'v$1_$2') === 'v1_13' In Pomsky, capturing groups are prefixed with a :, and optionally a name. Capturing groups are always preserved, unlike normal groups, which can be optimized away.\nIn repetitions, captures are overwritten in each iteration. For example, when matching :('foo' | 'bar')+ against the string foobar, the capturing group will contain the string bar in the end.\nAtomic groups #Atomic groups (prefixed with atomic) are an optimization for backtracking regex engines. An atomic group ensures that, once it has matched successfully, the regex engine never tries to backtrack into the group again.\nAtomic groups are not capturing.\nCompilation #Normal groups are conceptually equivalent to non-capturing groups in regular expressions. However, normal groups may be removed, and non-capturing groups may be created out of thin air by Pomsky.\nCapturing groups without a name are compiled to parentheses: :('foo') becomes (foo).\nHow named capturing groups are compiled depends on the regex engine. When targeting Python, PCRE, or Rust, :name('foo') is compiled to (?P\u0026lt;name\u0026gt;foo). Otherwise, it is compiled to (?\u0026lt;name\u0026gt;foo); note the missing P.\nAtomic groups are compiled to (?\u0026gt;...).\nIssues #Because lookbehinds should be evaluated in reverse direction, but aren\u0026rsquo;t in many regex engines, repeated groups in a lookbehind assertion may capture different values depending on the regex engine. For example, the regex (?\u0026lt;=(a|b){2})\\1 matches aba in JS, but abb in PCRE2.\nHistory # Atomic groups added in Pomsky 0.7 Initial implementation in Pomsky 0.1 "}),e.add({id:30,href:"/docs/reference/constructs/inline-regex/",title:"Inline Regex",description:"Reference – Embed regexes in Pomsky",content:`Inline regexes allow you to embed regular expressions in Pomsky.
Syntax #let InlineRegex = 'regex' String; Example #regex '(?2)' # subroutine regex '[\\w--[\\p{Latin}]]' # character set subtraction Support #Inline regexes are supported in all flavors.
Support for inline regexes is gated by the regexes feature. Specify features with the --allowed-features option.
Behavior #Inline regexes can do anything the targeted regex engine supports. However, inline regexes may not be as portable, because different regex engines use slightly different syntax for some features.
Compilation #The string content is emitted by the compiler verbatim. If the expression is repeated, it is wrapped in a non-capturing group. Pomsky also adds a non-capturing group if the inline regex is surrounded by parentheses. For example:
Pomsky Compiled regex regex 'a\u0026#x7c;b' a|b regex 'a\u0026#x7c;b'+ (?:a|b)+ regex 'a\u0026#x7c;b' 'c' a|bc (regex 'a\u0026#x7c;b') 'c' (?:a|b)c This is the only situation where parentheses affect the compiled regex, even though they do not affect precedence in the Pomsky expression.
Issues #Pomsky doesn\u0026rsquo;t know when the inline regex needs to be wrapped in a non-capturing group. No group is added when multiple expressions are concatenated, which may be incorrect if the inline regex is an alternation.
History #Initial implementation in Pomsky 0.8
`}),e.add({id:31,href:"/docs/reference/constructs/lookaround/",title:"Lookaround",description:"Reference – Assert what appears before or after a position",content:`Lookarounds assert that a certain expression matches before or after the current position. As an assertion, a lookaround does not contain any text; it matches between two code points.
Syntax #let Lookaround = LookaroundPrefix Expression; let LookaroundPrefix = | '\u0026lt;\u0026lt;' | '\u0026gt;\u0026gt;'; See Expression.
A lookaround must be wrapped in parentheses if it is followed by another expression:
(\u0026gt;\u0026gt; [word]) [Greek] Note that a lookaround contains an expression, so it introduces a new scope and can include statements.
Example #(!\u0026lt;\u0026lt; [w]) (\u0026gt;\u0026gt; disable unicode; let aw = [w]; aw{3} ) Support #Support for lookaround is gated by the lookahead and lookbehind features. Specify features with the --allowed-features option.
Lookahead is supported almost everywhere. Lookbehind support is more limited:
PCRE #PCRE does not support arbitrary-length lookbehind. PCRE must be able to determine the length of the lookbehind in advance, so \u003c\u003c 'foo'{3} works, but \u003c\u003c 'foo'+ does not. PCRE has a special case that a lookbehind containing an alternation works even if the alternatives have different lengths, but each alternative must be constant-length.
JavaScript #JavaScript fully supports lookahead and lookbehind. However, lookbehind is still unsupported in some older browsers (notably, Safari up to version 16.3).
Java #Before Java 13, repetition in lookbehind was required to be finite, * and + did not work. Since Java 13, repetition can be unbounded, but may not correctly handle repetition with multiple quantifiers if one of them is unbounded. Lookbehind also may not contain backreferences.
Python #Python supports lookahead and constant-length lookbehind. Repetitions and alternations like \u003c\u003c 'a' | 'bb' are forbidden in lookbehind.
Ruby, .NET #Full support for both lookahead and lookbehind
Rust #Lookaround not supported
Behavior #Lookahead checks if the contained expression matches at the current position. If it matches, the lookahead succeeds, otherwise it fails. Lookahead can be negated. A negative lookahead succeeds if the expression does not match. After the lookahead succeeded, the regex engine returns to the position in the string where it was before the lookahead, so the string matching the lookahead is not consumed.
Conceptually, lookbehind works in the same way, except that the expression is matched in reverse direction against the text preceding the current position. In reality, however, many regex engines do not match in reverse direction but go back n characters and check if the next n characters match the lookbehind.
Compilation # \u0026gt;\u0026gt; ... is compiled to (?=...) !\u0026gt;\u0026gt; ... is compiled to (?!...) \u0026lt;\u0026lt; ... is compiled to (?\u0026lt;=...) !\u0026lt;\u0026lt; ... is compiled to (?\u0026lt;!...) Issues #The various limitations on lookbehind by different regex engines are not enforced at the moment.
Security concerns #Lookbehind can be slow in some regex engines.
History #Initial implementation in Pomsky 0.1
`}),e.add({id:32,href:"/docs/reference/constructs/modifier/",title:"Modifier",description:"Reference – Change how the expression should be treated",content:`Modifiers change how the following expression should be treated.
Syntax #let Modifier = ModifierKeyword BooleanSetting ';'; let ModifierKeyword = | 'enable' | 'disable'; let BooleanSetting = | 'lazy' | 'unicode'; Example #enable lazy; disable unicode; [w]* ( disable lazy; .+ ) Support #Modifiers are supported in all flavors.
Support for each mode is gated by the lazy-mode and ascii-mode features. Specify features with the --allowed-features option.
Behavior #Modes can be enabled and disabled in any scope.
There are two modifiers that can be enabled or disabled:
Lazy #Enabling lazy mode means that all repetitions in the same scope are lazy by default; opting out is done with the greedy keyword, e.g.
enable lazy; [w]* greedy Unicode #Unicode mode is enabled by default. Disabling it means that the expression in the same scope is no longer Unicode aware and assumes an ASCII-only input. As a result, shorthand character classes are compiled differently (e.g. [space] is compiled to [ \\t-\\r]), and Unicode properties (e.g. [Greek]) are unavailable. Non-ASCII strings and code points are still allowed.
In JavaScript, Unicode must be disabled in order to use %, \u0026lt; and \u0026gt; word boundaries.
Disabling Unicode can vastly improve runtime performance, especially for [word] and [digit]. Alternatively, you can use [ascii_word], [ascii_digit], and so on.
Compilation #Modifiers produce no output, but they change how other expressions are compiled.
Issues #The dot and word boundaries are Unicode-aware in some regex engines even when Unicode mode is disabled.
Some mode modifiers are not yet implemented, most importantly ignore_case, single_line and multi_line.
History # Non-Unicode mode added in Pomsky 0.10 Lazy mode added in Pomsky 0.3 `}),e.add({id:33,href:"/docs/reference/constructs/negation/",title:"Negation",description:"Reference – Negating what an expression matches",content:`Anything that can be compiled to a lookaround, word boundary, or a character set can be negated.
Syntax #let Negation = '!' FixExpression; See FixExpression.
Example #let no_boundary = !%; !no_boundary (!\u0026gt;\u0026gt; !'a') Support #Negation is supported in all flavors.
Behavior #The following kinds of expression can be negated:
Word boundary % Lookarounds \u0026lt;\u0026lt;, \u0026gt;\u0026gt; Character set [...] Strings with exactly one code point Negations Negation happens late in the compilation process, after variable and range expansion, and some optimizations. It unwraps non-capturing groups with only one element.
Arbitrary nesting is allowed; !!x is equivalent to x, if x is negatable.
Compilation #Negated word boundaries are compiled to \\B. In JavaScript, this requires that Unicode is disabled. Negative lookbehind is compiled to (?\u0026lt;!...), negative lookahead is compiled to (?!...). Negative character sets are compiled to [^...]. When a character set contains exactly one shorthand, we try to just negate the shorthand to remove the character set; for example, ![s] can be compiled to \\S.
Pomsky requires that a character set is not empty. [] is rejected at the syntax level. Another way to create an empty character set is to negate a full character set, e.g. ![s !s]. This must therefore forbidden.
Issues #Detecting full character sets is not yet implemented properly. The current implementation only rejects some common cases, like ![w !w], but fails to reject ![w !d], for example.
Negation currently does not work for alternations like !('a' | 'c').
History # Negation changed to a late compilation step in Pomsky 0.11 ! syntactically allowed everywhere resolved after variable and range expansion can unwrap groups and turn single-char strings into character sets arbitrary nesting of negations Initial implementation in Pomsky 0.1 ! syntactically only allowed before %, \u0026lt;\u0026lt;, \u0026gt;\u0026gt;, and [ no double negation `}),e.add({id:34,href:"/docs/reference/constructs/number-range/",title:"Number ranges",description:"Reference – Matching a multi-digit number",content:`Number ranges are used to match multi-digit numbers. If you only need to match a single digit, you can use a character set instead.
Syntax #let NumberRange = 'range' String '-' String Base?; let Base = 'base' Number; Note that the strings must contain digits or ASCII letters in the supported range. For example, in base 16, the characters 0123456789abcdefABCDEF are allowed. The base must be between 2 and 36.
The first string must be at most as long as the second string. The number in the first string must be smaller than the second one. If either string has a leading zero (that is, it begins with 0 and has a length \u0026gt; 1), both strings must have the same length.
Example #range '128'-'255' Support #Number ranges are supported in all flavors.
Support for number ranges is gated by the ranges feature. Specify features with the --allowed-features option.
There is a limit on the maximum number of digits. In the CLI, this limit is 12. In the WASM library, the limit is 6. This is meant to avoid excessive compile times, since the expansion has exponential runtime behavior.
Behavior #A number range greedily matches a number in a radix based number system (by default, a decimal number). The radix can be specified after the base keyword as an integer between 2 and 36.
Two strings serve as the lower and upper bound of the matched number (both inclusive). For example, range '8'-'12' matches 8, 9, 10, 11, and 12.
If neither bound has leading zeroes, the expression never matches a string with leading zeroes. Otherwise, both numbers must have the same number of digits, including leading zeroes. For example, range '007'-'300' is ok, but range '07'-'300' is not. The number 0 itself is not considered to have a leading zero.
If leading zeroes are present, a match must have the same length as the specified bounds. For example, range '007'-'300' matches 034, but not 34.
Compilation #Compilation of number ranges uses a complicated, novel algorithm to turn the range into a tree of alternations. For example:
Pomsky expression Compiled range '0'-'10' 0\u0026#x7c;10?\u0026#x7c;[2-9] range '0'-'63' 0\u0026#x7c;[1-5][0-9]?\u0026#x7c;6[0-3]?\u0026#x7c;[7-9] range '63'-'137' 1(?:[0-2][0-9]\u0026#x7c;3[0-7])\u0026#x7c;6[3-9]\u0026#x7c;[7-9][0-9] range '100'-'200' 1[0-9]{2}\u0026#x7c;20{2} The generated regex is a DFA, so for every digit there is at most one transition. This means that matching is very efficient, since regex engines never need to backtrack. ### Algorithm ℹ️ If this description is missing any information, read the program code to find the details. We always look only at the first digit of each bound; these digits are called \`ax\` (from lower bound) and \`bx\` (from upper bound). For simplicity, we assume that the radix is 10 (decimal), but the algorithm works for any radix. For example: \`\`\`js // range '4'-'705' a = [4] b = [7, 0, 5] ax = 4 bx = 7 By looking at the first digit, we can deduce:
The number can\u0026rsquo;t start with 0 (leading zeros aren\u0026rsquo;t allowed) The number can start with 1, 2 or 3, but then it must be followed with 1 or 2 more digit in that case The number can be 4, 5 or 6, in which case it can be followed by 0, 1 or 2 more digits If the number starts with 7, it can be followed by nothing a zero, and possibly a third digit that is at most 5 a digit greater than zero, if there is no third digit. If the number starts with 8 or 9, it can be followed by at most 1 more digit. This is implemented recursively. We always remove the first digit from the slices. We then create a number of alternatives, each starting with a different digit or range of digits:
0 ..= ax-1 ax ax+1 ..= bx-1 bx bx+1 ..= 9 If ax and bx are identical, 3. and 4. are omitted; if they\u0026rsquo;re consecutive numbers, 3. is omitted. If ax is 0 or bx is 9, 1. or 5. is omitted, respectively. If ax is bigger than bx, the alternatives are a bit different, and this is important later:
0 ..= bx-1 bx bx+1 ..= ax-1 ax ax+1 ..= 9 There is one more special case: The first digit in a number can\u0026rsquo;t be 0, unless the range\u0026rsquo;s lower bound is 0. So we check if we are currently looking at the first digit, and if that is the case, the first character class omits 0. If the lower bound is 0, then an alternative containing only 0 is added once.
Now, for each of the above alternatives, we return two things: A character class matching the first digit, and something matching the remaining digits. That something is calculated by recursively applying the algorithm on the remaining digits. To make sure that this doesn\u0026rsquo;t recurse infinitely, we must detect terminal calls (calls that stop recursing):
If both slices are empty, we are done.
If both slices contain exactly 1 digit, we simply add a character class matching a digit in that range.
If the first slice is empty but not the second one, we apply a trick: We add a 0 to the lower bound and try again. Also, the returned sub-expression is made optional.
For example, range([4], [4, 0, 0]) at some point adds an alternative starting with 4 and calls range([], [0, 0]) recursively. We want this to match the empty string, any single digit, or two zeros, because a \u0026ldquo;4\u0026rdquo; matching the range 4-400 can be followed by nothing, any single digit or two zeros.
If we just added a 0 to the lower bound, that would mean that the 4 MUST be followed by at least one more digit. We don\u0026rsquo;t want that, so we make the expression following the 4 optional.
If the second slice is empty but not the first, this is an error that should NEVER happen. The parser validates the input so that the upper bound can\u0026rsquo;t be smaller/shorter than the lower bound.
Now, about the alternatives: This part is quite interesting. To recap, the alternatives are either this:
0 ..= ax-1 ax ax+1 ..= bx-1 bx bx+1 ..= 9 or this, if bx \u0026gt; ax:
0 ..= bx-1 bx bx+1 ..= ax-1 ax ax+1 ..= 9 Step 1 and 5 are the same either way, if we substitute ax and bx with min(ax, bx) in step 1 and with max(ax, bx) in step 5:
1. [1-(min - 1)] [0-9]{la + 1, lb} (first digit) 1. [0-(min - 1)] [0-9]{la + 1, lb} (not first digit) 5. [(max + 1)-9] [0-9]{al, bl - 1} (la and lb are the lengths of the remaining digits in the lower and upper bound, respectively).
What is the deal with the added or subtracted 1\u0026rsquo;s? If we have a lower bound such as 533, the number must be at least 3 digits long. However, if the first digit is less than 5, it must be at least 4 digits long to be greater than 533. With the upper bound, it\u0026rsquo;s the opposite: For example, with an upper bound of 6111, the number can be at most 3 digits if it starts with 7, 8 or 9.
The last step is to optimize the alternatives to be as compact as possible. This is achieved by simplifying and merging alternatives if applicable. For example,
[0-4] [5-9] | 5 [5-9] This can be merged into [0-5] [5-9]. The rules are like addition and multiplication, where alternation (with |) is equivalent to + and concatenation is equivalent to *. This means we can use the distributive law: a * x + b * x = (a + b) * x. Note that we only do this if the first character class of each alternation are consecutive; for example, we merge [0-4] and 5, but not [0-4] and [6-9]. This would be possible in theory, but would be computationally more expensive, since the second part of each alternation must be checked for equality.
The next optimization is to replace concatenation of equal elements with repetition. In other words, we replace a + a with a * 2, and a + (a * 2) with a * 3. This is important, because when we check whether two expressions are equal, it only works if they have the exact same structure: [0-9][0-9] is not considered equal to [0-9]{2}. So this optimization also serves as a normalization, to ensure that equal alternatives can be merged.
Issues #A range expression can match only a part of a number. For example, the expression range '4'-'20' matches each digit in 68. Anchors, word boundaries or lookarounds may be needed to prevent this.
Security concerns #Because compilation time is exponential with respect to the maximum number of digits, large ranges can be used to mount a DoS attack. This is partially remedied by the digit number limit (see above).
History # Restriction added to no longer allow ranges with leading zeroes and variable length in Pomsky 0.11 Initial implementation in Pomsky 0.3 `}),e.add({id:35,href:"/docs/reference/constructs/recursion/",title:"Recursion",description:"Recursion – Recursively match the entire expression",content:`Recursion allows you to recursively match the entire expression.
Syntax #let Recursion = 'recursion'; Example #One can parse mathematical terms with the following:
let op = ['+-/*']; let num = [digit]+; '-'? (num | '(' recursion ')') atomic(op recursion)* Care is needed to avoid ambiguity when possible, to prevent infinite recursion or excessive backtracking.
Support #Recursion is only supported in PCRE and Ruby.
Support for recursion is gated by the recursion feature. Specify features with the --allowed-features option.
Behavior #When a recursion expression is encountered, the regex engine saves the state of the match and starts over matching the whole regular expression at the current position. When the match succeeds, it restores the previous state and continues. Repeated recursions form a stack, similar to how recursion works in programming languages. It is possible to backtrack from/into recursion; this can be limited with atomic groups.
Compilation #Recursion compiles to \\g\u0026lt;0\u0026gt;, which is equivalent to calling the 0\u0026rsquo;th capturing group as subroutine, since regex engines implicitly create a capturing group with index 0 containing the whole match.
Issues #Pomsky does not detect infinite recursion. Recursion can also cause excessive backtracking.
History # Added recursion in Pomsky 0.11 `}),e.add({id:36,href:"/docs/reference/constructs/reference/",title:"References",description:"Reference – Match the same string as a previously matched capturing group again",content:"References allow you to match the same string as a previously matched capturing group again.\nSyntax #let Reference = | '::' Name | '::' Sign? Number; let Sign = | '+' | '-'; Note that references must be ASCII-only, so the allowed characters are a-z, A-Z, and 0-9. Numbers may not appear at the start of the name.\nExample #'r' :hashes('#'*) '\u0026quot;' C* lazy '\u0026quot;' ::hashes This matches Rust\u0026rsquo;s raw strings, which look like r#\u0026quot;...\u0026quot;# or r###\u0026quot;...\u0026quot;###. They must have the same number of # signs at the start and at the end.\nSupport #References are supported in all flavors except Rust (with some limitiations).\nSupport for references is gated by the references feature. Specify features with the --allowed-features option.\nBehavior #Since this is a complex topic, I recommend the documentation from regular-expressions.info (part 1, part 2, part 3, part 4). Pomsky uses a different syntax:\nNumbered references look like ::3 Named references look like ::foo Relative references look like ::-2 or ::+2. Pomsky usually converts all backreferences to an absolute number, so it supports named and relative backreferences even when targeting flavors that do not. Apart from this, Pomsky inherits the behavior of references from the targeted regex flavor.\nNotably, in JavaScript and Python, no forward references are supported, only backreferences.\nIn JavaScript, references can not match something captured in a previous iteration in a repetition. Also, JavaScript is the only flavor where a backreference matches an empty string when the referenced group didn\u0026rsquo;t participate in the match.\nPomsky only supports reference numbers up to 99. This restriction is imposed by Python.\nCompilation #References are usually compiled by looking up the index of the referenced group, and then emitting a numbered backreference such as \\3.\nThe exception is Ruby, where a named capturing group cannot be referenced by a numbered backreference. Therefore Pomsky checks if the referenced group has a name. If that\u0026rsquo;s the case, a named backreference, e.g. \\k\u0026lt;foo\u0026gt;, is emitted.\nIssues #References directly followed by a digit are miscompiled, e.g. :() ::1 '0' produces ()\\10, which looks like the 10th capturing group.\nIn .NET, when named and unnamed capturing groups are mixed, they are numbered in a weird way, so the group numbers calculated by Pomsky are wrong.\nPomsky doesn\u0026rsquo;t yet verify that backreferences are valid in all cases.\nHistory # Added relative references in Pomsky 0.3 Initial implementation in Pomsky 0.1 "}),e.add({id:37,href:"/docs/reference/constructs/repetition/",title:"Repetition",description:"Reference – Matching an expression potentially multiple times",content:`Repetitions allow matching an expression multiple times.
Syntax #let Repetition = AtomExpression RepetitionSuffix*; let RepetitionSuffix = RepetitionCount RepetitionMode?; let RepetitionCount = | '*' | '+' | '?' | RepetitionBraces; let RepetitionBraces = | '{' Number '}' | '{' Number? ',' Number? '}'; let RepetitionMode = | 'greedy' | 'lazy'; See AtomExpression.
There is a restriction that a ? and + repetitions may not appear immediately after another repetition (unless the first repetition is followed by lazy or greedy). This is to prevent confusion, since .*? means a lazy repetition and .*+ a possessive repetition in regular expressions.
Example #.{4,12} .+ lazy ('test'{3,})? Support #Repetition is supported in all flavors. In some flavors, repetition is not supported within a lookbehind assertion.
Behavior #Every repetition has a lower bound and an optional upper bound. The braces ({lower,upper}) are the canonical way to represent a repetition.
Syntax Lower, upper bound .{,} 0, infinity .{4,} 4, infinity .{,10} 0, 10 .{4,10} 4, 10 .{5} 5, 5 .? 0, 1 .* 0, infinity .+ 1, infinity There are two repetition modes, greedy and lazy repetition. In greedy mode (the default), the regex engine tries to match the repeated expression as often as possible, whereas in lazy mode, the regex engines tries to match it as few times as possible.
The default repetition mode can be changed with enable lazy; or disable lazy;.
Compilation #Pomsky first determines the lower and upper bound of each repetition. After variable and range expansion, it may simplify nested repetitions using rules like the following:
x{1} = x ''{a,b} = '' x{a,b}{c} = x{a·c,b·c} x{a}{b,c} = x{a·b,a·c} (x{1,a})? = x{0,a} x{1,a}? = x{0,a} x*{a,b} = x* x{a,b}{c,d} = x{a·c,b·d} if both a and c are 0 or 1 Note that most of these optimizations are only valid if either both repetitions are lazy or both are greedy.
Pomsky will then produce a maximally compact repetition, using ?, + or ? if possible, or using {n}, {n,} or {m,n} otherwise. If the repetition is lazy, another ? is added. For example, 'x'{1,} lazy compiles to x+?.
Sometimes the repeated expression must be wrapped in a non-capturing group, e.g. 'test'_ is compiled to (?:test)_.
Security concerns #Repetition (especially when nested) can be extremely slow, exhibiting exponential runtime, when executing the regex in a backtracking regex engine. Most regex engines use backtracking.
From the regex flavors supported by Pomsky, only Rust never uses backtracking, so it can guarantee linear runtime performance with respect to the haystack × regex length.
History # Implemented basic optimizations im Pomsky 0.8 Made + following a repetition illegal in Pomsky 0.6 Made ? following a repetition illegal in Pomsky 0.3 Changed the default repetition mode from lazy to greedy in Pomsky 0.3 Implemented in Pomsky 0.1 `}),e.add({id:38,href:"/docs/reference/constructs/tests/",title:"Tests",description:"Reference – Writing unit tests in Pomsky",content:`Tests are written in a test {} block to unit test the expression by matching it against a list of strings.
Syntax #let Test = 'test' '{' TestCase* '}'; let TestCase = | TestCaseMatch | TestCaseMatchAll | TestCaseReject; let TestCaseMatch = 'match' TestCaseSingleMatch ';'; let TestCaseMatchAll = 'match' TestCaseMatches? 'in' String ';'; let TestCaseReject = 'reject' 'in'? String ';'; let TestCaseMatches = TestCaseSingleMatch (',' TestCaseSingleMatch)*; let TestCaseSingleMatch = String TestCaptures?; let TestCaptures = 'as' '{' TestCapturesInner? '}'; let TestCapturesInner = TestCapture (',' TestCapture)* ','?; let TestCapture = TestCaptureName ':' String; let TestCaptureName = | Number | Ident; Test may only appear in the top-level scope, so the following is forbidden:
( test {} # ERROR 'foo' ) Example #test { match '[email protected]'; match '[email protected]' as { 1: 'john.doe', domain: 'mail.com' }; match '[email protected]', '[email protected]!' in 'My addresses are [email protected] and [email protected]!'; reject 'john.doe@mailbox'; reject in 'There is no valid email@address in this string'; } :(![s '@']+) '@' :domain(![s '@']+ '.' ![s '@']) Support #Tests are supported in all flavors, but can only be executed with PCRE2 at the moment. This is done by passing --test=pcre2 to the CLI.
Behavior #A test case can either assert that something matches (with match) or does not match (with reject). Pomsky supports two matching modes, exact match and substring match. The substring matching mode is used when the test case includes the in keyword.
Syntax Behavior match 'foo'; expects exact match reject 'foo'; expects no exact match match 'f', 'o' in 'foo'; expects substring match \u0026lsquo;f\u0026rsquo; reject in 'foo'; expects no substring matches When using substring matches, all matches must be specified and in the correct order. The matches do not overlap. You can specify capturing groups for each substring match individually.
When specifying capturing groups, you do not need to specify all of them; only the specified groups are compared. They can appear in any order. Unnamed capturing groups are assigned an ascending number, starting with 1. The capturing group 0 is the entire match.
Tests are only executed when the --test option is used in the CLI.
Compilation #Tests do not produce any output. However, when the --test flag is used, the expression is compiled twice: Once for the target flavor, and again for PCRE2. The compiled PCRE2 pattern is used for testing and discarded afterwards.
Issues #The potential mismatch between the target flavor and the flavor used for testing can result in false positives (where PCRE2 accepts a pattern that is illegal in the target flavor) and false negatives (where PCRE2 fails to match a pattern that would match in the target flavor).
One common example is when the expression contains a lookbehind assertion of variable length. PCRE2 only supports constant-length lookbehind due to technical limitations.
Security concerns #Since PCRE2 is a backtracking regex engine, an attacker should not be allowed to compile and test untrusted Pomsky expressions on a server, as this can lead to exponential backtracking and exhaust the server\u0026rsquo;s resources.
History #Initial implementation in Pomsky 0.11
Supports only PCRE2 `}),e.add({id:39,href:"/docs/reference/constructs/variables/",title:"Variables",description:"Reference – Declaring and using variables",content:`Variables can be declared and later used to keep your code DRY. Variables are inlined into the resulting expressions, similarly to macros in some programming languages.
Syntax #let LetDeclaration = 'let' Name '=' OrExpression ';'; let Variable = Name; Variabes are used simply by mentioning their name.
Example #let number = [digit]+; let identifier = [ascii_alnum '-']+; let identifiers = identifier ('.' identifier)*; number '.' number '.' number ('-' identifiers)? ('+' identifiers)? Support #Variables are supported in all flavors, since they are inlined.
Support for variables is gated by the variables feature (enabled by default). Specify features with the --allowed-features option.
Behavior #A variable may be used multiple times, but not recursively:
let a = '.' b; let b = ':' a?; # ERROR This is because variables are inlined, so recursion would produce a regex of infinite size.
Variable declarations must be written before the actual expression. They can be nested within groups and lookaround assertions. When nested, the variables can only be used within the enclosing scope:
( let foo = 'foo'; foo # allowed ) foo # ERROR Variables from an outer scope can be \u0026ldquo;shadowed\u0026rdquo; (redeclared) in an inner scope. When using it in the inner scope, it refers to the inner (shadowed) declaration, but when using it in the outer scope, it refers to the outer variable:
let foo = '1'; ( let foo = '2'; foo # 2 ) foo # 1 Technically, these are considered two different variables that just happen to have the same name, but the inner variable is only accessible within the group in which it was declared.
Variables can depend on each other, as long as there are no cycles, and the order of declarations does not matter. Notably, a variable can be used before it was declared:
let a = b b; let b = 'test'; a There are a few built-in variables. These can also be shadowed.
Built-in variables #There are 6 built-in variables:
Grapheme matches a single extended grapheme cluster. It compiles to the regex \\X. Note that this functionality is not available in all regex flavors. G is an alias for Grapheme Codepoint matches a single Unicode code point. It compiles to the regex [\\s\\S]. C is an alias for Codepoint Start: Matches the start of the string. Equivalent to ^. End: Matches the end of the string. Equivalent to $. Compilation #Compilation works by recursively substituting variables with the expression in their declaration. This is called expansion:
let a = '.' b?; let b = 'test'*; a+ becomes:
let b = 'test'*; ('.' b?)+ becomes:
('.' 'test'*)+ Note that expressions sometimes need to be wrapped in a group. Also, the expansion sometimes enables optimizations, such as the removal of the ? repetition above.
Issues #Because of the way variables are compiled, the resulting regex can be quite large \u0026ndash; so large, in fact, that regex engines may run out memory trying to compile them into a state machine. This is particularly likely in the Rust flavor. To remedy this, be careful how often you use variables that expand to complicated expressions.
Security concerns #Expansion of variables is not cached, so compilation time can be exponential, see the Billion Laughs Attack as an example.
An attacker should not be allowed to compile untrusted Pomsky expressions on a server, as this can take forever and exhaust the server\u0026rsquo;s resources.
History # Built-in variables Start, End, Codepoint, Grapheme added in Pomsky 0.4.2 Initial implementation in Pomsky 0.3 `}),e.add({id:40,href:"/docs/examples/",title:"Examples",description:"Get inspiration from real-world examples",content:""}),e.add({id:41,href:"/docs/examples/numbers/",title:"Example: Numbers",description:"Rational numbers in decimal notation with optional separating commas",content:`This regular expression matches rational numbers in decimal notation with optional separating commas:
[-+]??\\b(?:0|[1-9](?:,??[0-9])*)(?:\\.[0-9]+)?\\b Equivalent Pomsky expression:
['-+']? % ('0' | ['1'-'9'] (','? ['0'-'9'])*) ('.' ['0'-'9']+)? % `}),e.add({id:42,href:"/docs/examples/passwords/",title:"Example: Passwords",description:"Test if password satisfies strength requirements",content:`Here\u0026rsquo;s a regular expression that tests if a string contains at least one uppercase letter, lowercase letter, digit and punctuation/symbol code point, and is at least 8 code points long:
^(?=[\\S\\s]*?\\d)(?=[\\S\\s]*?\\p{Ll})(?=[\\S\\s]*?\\p{Lu})(?=[\\S\\s]*?[\\pP\\pS])[\\S\\s]{8} Equivalent Pomsky expression:
^ (\u0026gt;\u0026gt; C* [digit]) (\u0026gt;\u0026gt; C* [Ll]) (\u0026gt;\u0026gt; C* [Lu]) (\u0026gt;\u0026gt; C* [P S]) C{8} `}),e.add({id:43,href:"/docs/examples/java-idents/",title:"Example: Java Identifiers",description:"Pomsky expression matching a Java identifier",content:`Regex matching a Java identifier:
[\\p{Connector_Punctuation}\\p{Currency_Symbol}\\p{Mark}\\p{Alphabetic}][\\p{Connector_Punctuation}\\p{Currency_Symbol}\\p{Mark}\\p{Alphabetic}\\p{Numeric}]* With abbreviations:
[\\p{Pc}\\p{Sc}\\p{M}\\p{Alpha}][\\p{Pc}\\p{Sc}\\p{M}\\p{Alpha}\\p{Numeric}]* And as a Pomsky:
[Pc Sc M Alpha] [Pc Sc M Alpha Numeric]* `}),e.add({id:44,href:"/docs/examples/emails/",title:"Example: Email Addresses",description:"Match an RFC 5322 compliant email address",content:"This StackOverflow answer contains a massive regular expression for matching any RFC 5322 compliant email address:\n(?:[a-z0-9!#$%\u0026amp;'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%\u0026amp;'*+/=?^_`{|}~-]+)*|\u0026quot;(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\u0026quot;)@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\]) If your regex engine supports insiginificant whitespace mode (?x), it can be written like this:\n(?x) (?: [a-z0-9!#$%\u0026amp;'*+/=?^_`{|}~-]+ (?: \\. [a-z0-9!#$%\u0026amp;'*+/=?^_`{|}~-]+ )* | \u0026quot; (?: [\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f] | \\\\ [\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f] )* \u0026quot; ) @ (?: (?: [a-z0-9] (?: [a-z0-9-]* [a-z0-9] )? \\. )+ [a-z0-9] (?: [a-z0-9-]* [a-z0-9] )? | \\[ (?: (?: (2 (5 [0-5] | [0-4] [0-9]) | 1 [0-9] [0-9] | [1-9]? [0-9]) ) \\. ){3} (?: (2 (5 [0-5] | [0-4] [0-9]) | 1 [0-9] [0-9] | [1-9]? [0-9]) | [a-z0-9-]* [a-z0-9] : (?: [\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f] | \\\\ [\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f] )+ ) \\] ) Here\u0026rsquo;s a straightforward translation into Pomsky:\n( | ['a'-'z' '0'-'9' \u0026quot;!#$%\u0026amp;'*+/=?^_`{|}~-\u0026quot;]+ ('.' ['a'-'z' '0'-'9' \u0026quot;!#$%\u0026amp;'*+/=?^_`{|}~-\u0026quot;]+ )* | '\u0026quot;' ( [U+01-U+08 U+0b U+0c U+0e-U+1f U+21 U+23-U+5b U+5d-U+7f] | '\\' [U+01-U+09 U+0b U+0c U+0e-U+7f] )* '\u0026quot;' ) '@' ( | ( ['a'-'z' '0'-'9'] ( ['a'-'z' '0'-'9' '-']* ['a'-'z' '0'-'9'] )? '.' )+ ['a'-'z' '0'-'9'] ( ['a'-'z' '0'-'9' '-']* ['a'-'z' '0'-'9'] )? | '[' (:(range '0'-'255') '.'){3} ( | :(range '0'-'255') | ['a'-'z' '0'-'9' '-']* ['a'-'z' '0'-'9'] ':' ( | [U+01-U+08 U+0b U+0c U+0e-U+1f U+21-U+5a U+53-U+7f] | '\\' [U+01-U+09 U+0b U+0c U+0e-U+7f] )+ ) ']' ) Notice how the complex logic for matching a number between \u0026lsquo;0\u0026rsquo; and \u0026lsquo;255\u0026rsquo; is replaced by a simple range expression in Pomsky.\nWe can also write the above as follows using variables:\nlet before_at = ['a'-'z' '0'-'9' \u0026quot;!#$%\u0026amp;'*+/=?^_`{|}~-\u0026quot;]; let escaped = '\\' [U+01-U+09 U+0b U+0c U+0e-U+7f]; let quoted_before_at = [U+01-U+08 U+0b U+0c U+0e-U+1f U+21 U+23-U+5b U+5d-U+7f]; let port_digit = [U+01-U+08 U+0b U+0c U+0e-U+1f U+21-U+5a U+53-U+7f]; let lower_digit = ['a'-'z' '0'-'9']; let lower_digit_dash = ['a'-'z' '0'-'9' '-']; let domain_label = lower_digit (lower_digit_dash* lower_digit)?; ( | before_at+ ('.' before_at+)* | '\u0026quot;' (quoted_before_at | escaped)* '\u0026quot;' ) '@' ( | (domain_label '.')+ domain_label | '[' (:(range '0'-'255') '.'){3} ( | :(range '0'-'255') | lower_digit_dash* lower_digit ':' (port_digit | escaped)+ ) ']' ) "}),e.add({id:45,href:"/docs/examples/ip-addresses/",title:"Example: IP addresses",description:"Test if a string is a valid IPv4 or IPv6 address",content:"Here\u0026rsquo;s a regular expression that checks if a string is a valid IPv4 or IPv6 address:\n^(([0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(:[0-9a-fA-F]{1,4}){1,6}|:(:[0-9a-fA-F]{1,4}){1,7}|::|fe80:(:[0-9a-fA-F]{1,4}){0,4}%[0-9a-zA-Z]+|::(ffff(:0{1,4})?:)?((25[0-5]|(2[0-4]|1?[0-9])?[0-9])\\.){3}(25[0-5]|(2[0-4]|1?[0-9])?[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1?[0-9])?[0-9])\\.){3}(25[0-5]|(2[0-4]|1?[0-9])?[0-9])|((25[0-5]|(2[0-4]|1?[0-9])?[0-9])\\.){3}(25[0-5]|(2[0-4]|1?[0-9])?[0-9]))$ Here\u0026rsquo;s the same regexp in free-spacing mode with some comments added:\n(?x) ^( ( [0-9a-fA-F]{1,4} : ){7} [0-9a-fA-F]{1,4} | ( [0-9a-fA-F]{1,4} : ){1,7} : | ( [0-9a-fA-F]{1,4} : ){1,6} : [0-9a-fA-F]{1,4} | ( [0-9a-fA-F]{1,4} : ){1,5} ( : [0-9a-fA-F]{1,4} ){1,2} | ( [0-9a-fA-F]{1,4} : ){1,4} ( : [0-9a-fA-F]{1,4} ){1,3} | ( [0-9a-fA-F]{1,4} : ){1,3} ( : [0-9a-fA-F]{1,4} ){1,4} | ( [0-9a-fA-F]{1,4} : ){1,2} ( : [0-9a-fA-F]{1,4} ){1,5} | [0-9a-fA-F]{1,4} : ( : [0-9a-fA-F]{1,4} ){1,6} | : ( : [0-9a-fA-F]{1,4} ){1,7} | :: | fe80: ( : [0-9a-fA-F]{1,4} ){0,4} % [0-9a-zA-Z]+ # link-local IPv6 addresses with zone index | :: ( ffff ( : 0{1,4} )? : )? ( ( 25[0-5] | ( 2[0-4] | 1?[0-9] )? [0-9] ) \\.){3} ( 25[0-5] | ( 2[0-4] | 1?[0-9] )? [0-9] ) # IPv4-mapped IPv6 addresses and IPv4-translated addresses | ( [0-9a-fA-F]{1,4} : ){1,4} : ( ( 25[0-5] | ( 2[0-4] | 1?[0-9] )? [0-9] ) \\.){3} ( 25[0-5] | ( 2[0-4] | 1?[0-9] )? [0-9] ) # IPv4-Embedded IPv6 Address | ( ( 25[0-5] | ( 2[0-4] | 1?[0-9] )? [0-9] ) \\.){3} ( 25[0-5] | ( 2[0-4] | 1?[0-9] )? [0-9] ) # IPv4 )$ And the equivalent Pomsky expression:\n# segment in an IPv4 address let num_v4 = range '0'-'255'; # segment in an IPv6 address let num_v6 = [ascii_xdigit]{1,4}; # IPv4 address let ipv4 = (num_v4 '.'){3} num_v4; # link-local IPv6 address with zone index # e.g. fe80::7:8%eth0 fe80::7:8%1 let link_local = 'fe80:' (':' num_v6){0,4} '%' [ascii_alnum]+; # IPv4-Embedded IPv6 Address # e.g. 2001:db8:3:4::192.0.2.33 64:ff9b::192.0.2.33 let ipv4_embedded = (num_v6 ':'){1,4} ':' ipv4; # IPv4-mapped IPv6 address or IPv4-translated address # e.g. ::255.255.255.255 ::ffff:255.255.255.255 ::ffff:0:255.255.255.255 let ipv4_mapped_translated = '::' ('ffff' (':' '0'{1,4})? ':')? ipv4; # IPv6 address let ipv6 = ( | (num_v6 ':'){7} num_v6 | (num_v6 ':'){1,7} ':' | (num_v6 ':'){1,6} (':' num_v6) | (num_v6 ':'){1,5} (':' num_v6){1,2} | (num_v6 ':'){1,4} (':' num_v6){1,3} | (num_v6 ':'){1,3} (':' num_v6){1,4} | (num_v6 ':'){1,2} (':' num_v6){1,5} | (num_v6 ':') (':' num_v6){1,6} | ':' (':' num_v6){1,7} | '::' | link_local | ipv4_mapped_translated | ipv4_embedded ); # IP address ^ (ipv6 | ipv4) $ "}),e.add({id:46,href:"/docs/examples/semver/",title:"Example: Semantic version",description:"Validate a version matches the semver 2.0 spec",content:`Here\u0026rsquo;s a regular expression for parsing a version according to the semver 2.0.0 specification:
(?P\u0026lt;major\u0026gt;\\d+)\\.(?P\u0026lt;minor\u0026gt;\\d+)\\.(?P\u0026lt;patch\u0026gt;\\d+)(?:-(?P\u0026lt;prerelease\u0026gt;[0-9a-zA-Z\\-]+(?:\\.[0-9a-zA-Z\\-]+)*))?(?:\\+(?P\u0026lt;buildmeta\u0026gt;[0-9a-zA-Z\\-]+(?:\\.[0-9a-zA-Z\\-]+)*))? And in free-spacing mode:
(?P\u0026lt;major\u0026gt; \\d+) \\. (?P\u0026lt;minor\u0026gt; \\d+) \\. (?P\u0026lt;patch\u0026gt; \\d+) (?: - (?P\u0026lt;prerelease\u0026gt; [0-9a-zA-Z\\-]+ (?: \\. [0-9a-zA-Z\\-]+ )* ) )? (?: \\+ (?P\u0026lt;buildmeta\u0026gt; [0-9a-zA-Z\\-]+ (?: \\. [0-9a-zA-Z\\-]+ )* ) )? Here\u0026rsquo;s an equivalent Pomsky expression:
let number = [digit]+; let identifier = [ascii_alnum '-']+; let identifiers = identifier ('.' identifier)*; :major(number) '.' :minor(number) '.' :patch(number) ('-' :prerelease(identifiers))? ('+' :buildmeta(identifiers))? `}),e.add({id:47,href:"/docs/appendix/",title:"Appendix",description:"Additional resources",content:""}),e.add({id:48,href:"/docs/appendix/comparison/",title:"Comparison with other projects",description:"See how Pomsky compares to similar projects",content:`This wiki has a list of projects with similar goals to Pomsky. Here\u0026rsquo;s a list of the most popular projects:
⚠️ Disclaimer that as the maintainer of Pomsky, I am obviously biased. If you find any incorrect or misleading information, please open an issue. Project Types GitHub Melody Transpiled Pomsky Transpiled Egg Expressions Transpiled
App: Oil shell Rx Expressions Transpiled
App: Emacs Raku Grammars App: Raku Rosie App: Rosie SRL DSL: PHP Super Expressive DSL: JS Verbal Expressions DSL: JS Swift RegexBuilder DSL: Swift Since this content is likely to get out of date, I encourage you to update it.
Types #Transpiled #These languages are transpiled to \u0026ldquo;normal\u0026rdquo; regular expressions and can therefore be used anywhere. They usually have command-line interface to compile expressions.
Application specific #Some regex languages are specific to a certain application or programming language. For example, Raku grammars can only be used in Raku; egg expressions are transpiled, but they are only available in the Oil shell.
DSLs #DSLs (domain-specific languages) are languages that are embedded in another language using the host language\u0026rsquo;s syntax. For example, Verbal Expressions uses JavaScript methods:
const tester = VerEx() .startOfLine() .then('http') .maybe('s') .then('://') .maybe('www.') .anythingBut(' ') .endOfLine() This page currently only discusses transpiled languages, but I welcome contributions.
Compatibility #Let\u0026rsquo;s see what Regex flavors are supported by transpiled languages.
Flavor Melody Pomsky Egg Expr. Rx Expr. ERE ✅ ✅ ECMAScript ✅ ✅ PCRE ✅* ✅ .NET ✅* ✅ Java ✅* ✅ Ruby ✅* ✅ Python ✅ Rust ✅ RE2 ✅ *Melody can only emit ECMAScript regexes, but they also happen to be compatible with several other flavors.
Explanation of the flavors # ERE (extended regular expressions) are used by tools such as GNU grep and awk. Because ERE supports only the most basic features, it is mostly forward compatible with other regex flavors.
ECMAScript is the syntax used in JavaScript and related languages (TypeScript, Elm, Dart, etc.) that are compiled to JS.
PCRE (an acronym for \u0026ldquo;Perl compatible regular expression\u0026rdquo;) is the syntax used by the PCRE2 regex engine, which is the default in at least Crystal, Delphi, Elixir, Erlang, Hack, Julia, PHP, R, and Vala. It\u0026rsquo;s also a popular choice in other languages like C and C++ and is used in many applications such as the Apache server, nginx, MariaDB, MongoDB, and optionally in GNU grep.
.NET refers to the .NET regular expressions, used by languages such as C# and F#.
Java refers to the Pattern class in the Java standard library. Also used in Kotlin and Scala.
Ruby refers to built-in regular expressions in Ruby (using the oniguruma regex library).
Python refers to Python\u0026rsquo;s re module. Note that Python 3 is required for good Unicode support.
Rust refers to Rust\u0026rsquo;s popular regex crate (used by ripgrep)
RE2 refers to Google\u0026rsquo;s re2 library; this flavor is also compatible with Go\u0026rsquo;s regexp package.
Many more flavors exist, which are not (or only partially) supported by Pomsky and other languages.
Features #Let\u0026rsquo;s see what Regex features are supported by languages that are transpiled to regular expressions.
Basic regex features # Feature Melody Pomsky Egg Expr. Rx Expr. Greedy repetition ✅ ✅ ✅ ✅ Lazy repetition ✅ ✅ ✅ ✅ Dot ✅ ✅ ✅ ✅ Character escape ✅ ✅ ✅ ✅ Character class ✅ ✅ ✅ ✅ Anchor ✅ ✅ ✅ ✅ Word boundary ✅ ✅ ✅ ✅ Negated word boundary ✅ ✅ ✅ ✅ Character range partly* ✅ ✅ ✅ Character set ✅ ✅ ✅ Negated character set partly* ✅ ✅ ✅ Capturing group ✅ ✅ ✅ ✅ Alternation ✅ ✅ ✅ ✅ POSIX class ✅ ✅ ✅ Non-capturing group ✅ ✅ ✅ *Character ranges and negated sets in Melody only support ASCII letters, digits and a few special characters.
Advanced features # Feature Melody Pomsky Egg Expr. Rx Expr. Variable/macro ✅ ✅ ✅ ✅ Line comment ✅ ✅ ✅ ✅ Block comment ✅ Code point ✅ ✅ Lookaround ✅ ✅ Named capturing group ✅ ✅ ✅ Backreference ✅ ✅ Named backreference ✅ Relative backreference ✅ Unicode category ✅ ✅ Unicode script/block ✅ partly Unicode script extensions ✅ Other Unicode property ✅ Any code point partly* ✅ partly* partly* Any grapheme ✅ Atomic group ✅ Character set intersection ✅ ✅ Conditional Recursion ✅ Modifier Inline regex ✅ ✅ Optimization some** Note that Melody and Pomsky support inline regexes. Because of this, all Regex features are technically supported in Melody and Pomsky, but using inline regexes may be less ergonomic and more dangerous to use than properly supported features.
*All languages can match a code point with the dot, if multiline mode is enabled in the regex engine.
**Pomsky can currently
optimize repetitions remove redundant or empty groups in character sets, deduplicate code points and merge overlapping ranges merge single-character alternations into character sets More optimizations are planned.
Tooling # Tool Melody Pomsky Egg Expr. Rx Expr. CLI ✅ ✅ REPL ✅ ✅ Online playground ✅ ✅ VSCode extension ✅ ✅ IntelliJ extension ✅ JavaScript bunder Babel Vite,
Rollup,
ESBuild,
Webpack Rust macro ✅ Linter Formatter Packages # Tool Melody Pomsky Homebrew ✅ ✅ AUR ✅ ✅ Nix ✅ ✅ GitHub release binary (Apple) ✅ ✅ GitHub release binary (Windows) ✅ GitHub release binary (Linux) ✅ Node module ✅ ✅ Python module ✅ IDE features # Feature Melody Pomsky Syntax highlighting ✅ ✅ Error highlighting ✅ Code folding ✅1 ✅1 Auto indentation ✅ ✅ Snippets ✅ ✅ Matching brackets and quotes ✅2 ✅ Keyword autocomplete ✅2 ✅ Variable autocomplete ✅3 Backreference autocomplete Character class autocomplete ✅ Unicode property autocomplete ✅ Hover tooltips Apply suggestions Share link (playground) ✅ ✅ 1 indentation based
2 works in playground VSCode, but not in the playground
3 does not take scopes into account
Found a mistake? Please fix it on GitHub.
`}),e.add({id:49,href:"/docs/appendix/security/",title:"Security",description:"Advice how to use Pomsky securely",content:`If you intend to compile or execute Pomsky expressions on a web server or other critical infrastructure, follow this advice, so you don\u0026rsquo;t end up vulnerable to attacks.
A billion laughs #The most important advice is to never compile an untrusted Pomsky expression, since doing that may make you vulnerable for denial of service attacks. Here\u0026rsquo;s a simple example:
let lol = 'lol'; let lol1 = lol lol lol lol lol lol lol lol lol lol; let lol2 = lol1 lol1 lol1 lol1 lol1 lol1 lol1 lol1 lol1 lol1; let lol3 = lol2 lol2 lol2 lol2 lol2 lol2 lol2 lol2 lol2 lol2; let lol4 = lol3 lol3 lol3 lol3 lol3 lol3 lol3 lol3 lol3 lol3; let lol5 = lol4 lol4 lol4 lol4 lol4 lol4 lol4 lol4 lol4 lol4; let lol6 = lol5 lol5 lol5 lol5 lol5 lol5 lol5 lol5 lol5 lol5; let lol7 = lol6 lol6 lol6 lol6 lol6 lol6 lol6 lol6 lol6 lol6; let lol8 = lol7 lol7 lol7 lol7 lol7 lol7 lol7 lol7 lol7 lol7; let lol9 = lol8 lol8 lol8 lol8 lol8 lol8 lol8 lol8 lol8 lol8; lol9 What does this expression do? It evaluates the variable lol9, which expands to the lol8 variable 10 times, each of which expands to lol7 10 times, and so on. This exploit, called the Billion Laughs attack, produces the word lol a 1,000,000,000 times, which takes full 5 minutes to compile on my laptop.
If you only compile Pomsky expressions you wrote yourself (or someone you trust), this is not a problem, since it is quite unlikely to write something like the above by accident. Furthermore, it is impossible to run into an infinite loop since Pomsky forbids recursive variable declarations.
Backtracking #Just like you shouldn\u0026rsquo;t compile untrusted Pomsky expressions, you also shouldn\u0026rsquo;t execute an untrusted regex. The reason for this is that most regex engines are backtracking, which has (worst-case) exponential runtime performance. A regex created with not enough care or by a bad actor can easily take down a NodeJS server if the server naively matches a large body of text against the regex.
What does this mean for Pomsky? Unless you use RE2 or Rust\u0026rsquo;s regex crate (which never backtrack), Pomsky expressions are just as susceptible to catastrophic backtracking as hand-written regexes. Therefore, don\u0026rsquo;t execute untrusted regexes on critical infrastructure.
A million ranges #Another thing to watch out for are range expressions: Since the complexity of compiling range expressions is exponential, compiling large ranges can take unusually long:
# all unsigned 64-bit integers range '0'-'18446744073709551616' The above Pomsky takes 70 milliseconds on my laptop to compile. But add 4 digits and it\u0026rsquo;s over a second. This is already remedied by default, since ranges can by default be at most 6 digits long, or 12 digits in the CLI. Be careful if you override this default.
This limit is not sufficient, though: It is easy to generate an expression containing a million ranges each with 6 digits. This takes 45 seconds to compile on my laptop.
Hardening Pomsky #If you intend to compile Pomsky expressions on a web server, but not execute the resulting regex, there are a few things you can do to stay safe:
Disable the range feature and variables Limit the length of the Pomsky. Limit the number of HTTP requests a user can make per minute Run the Pomsky compiler in a separate thread and stop the thread if Pomsky doesn\u0026rsquo;t complete in a certain time frame I make no guarantees whether these suggestions are sufficient to protect your service.
`}),e.add({id:50,href:"/docs/appendix/unicode-properties/",title:"Unicode properties",description:"Exhaustive list of Unicode general categories, scripts, blocks and other properties supported by Pomsky",content:`Pomsky supports the following kinds of Unicode properties:
General categories Scripts Script extensions Blocks Other boolean properties However, not all regex engines support all of them. In particular, blocks and other properties are poorly supported.
Prefixes #You may add a prefix to properties to indicate what kind of property it is:
gc: or general_category: indicates a general category script: or sc: indicates a script script_extensions: or scx: indicates script extensions block: or blk: indicates a block [gc:Letter] # the \u0026quot;Letter\u0026quot; general category [scx:Greek] # the \u0026quot;Greek\u0026quot; script extensions Except for blocks, these prefixes are optional, so you can write [Letter] and [Greek] if you prefer.
General Categories #Every Unicode code point is in one of the following General Categories:
Letter Mark Number Punctuation Symbol Separator Other Each of these categories is subdivided into smaller categories. More information on Wikipedia.
In Pomsky, you can match against categories in square brackets:
[Uppercase_Letter Mark] Using the abbreviations, the above can be written as
[Lu M] Show all 38 categories Abbr Long Description Lu Uppercase_Letter an uppercase letter Ll Lowercase_Letter a lowercase letter Lt Titlecase_Letter a digraphic character (e.g. ‘Dž’), first part uppercase LC Cased_Letter Lu | Ll | Lt Lm Modifier_Letter a modifier letter Lo Other_Letter other letters, including syllables and ideographs L Letter Lu | Ll | Lt | Lm | Lo Mn Nonspacing_Mark a nonspacing combining mark (zero advance width) Mc Spacing_Mark a spacing combining mark (positive advance width) Me Enclosing_Mark an enclosing combining mark M Mark Mn | Mc | Me Nd Decimal_Number a decimal digit Nl Letter_Number a letterlike numeric character No Other_Number a numeric character of other type N Number Nd | Nl | No Pc Connector_Punctuation a connecting punctuation mark, like a tie Pd Dash_Punctuation a dash or hyphen punctuation mark Ps Open_Punctuation an opening punctuation mark (of a pair) Pe Close_Punctuation a closing punctuation mark (of a pair) Pi Initial_Punctuation an initial quotation mark Pf Final_Punctuation a final quotation mark Po Other_Punctuation a punctuation mark of other type P Punctuation Pc | Pd | Ps | Pe | Pi | Pf | Po Sm Math_Symbol a symbol of mathematical use Sc Currency_Symbol a currency sign Sk Modifier_Symbol a non-letterlike modifier symbol So Other_Symbol a symbol of other type S Symbol Sm | Sc | Sk | So Zs Space_Separator a space character (of various non-zero widths) Zl Line_Separator U+2028 LINE SEPARATOR only Zp Paragraph_Separator U+2029 PARAGRAPH SEPARATOR only Z Separator Zs | Zl | Zp Cc Control a C0 or C1 control code Cf Format a format control character Cs Surrogate a surrogate code point
⚠️ not supported in Rust Co Private_Use a private-use character Cn Unassigned a reserved unassigned code point or a noncharacter C Other Cc | Cf | Cs | Co | Cn Support # PCRE JS Java Ruby Rust .NET Python RE2 ✅ ✅ ✅ ✅ ✅ ✅ ⛔ ✅ Rust does not support the Surrogate category, because it is always Unicode aware and UTF-16 surrogates are not valid Unicode scalar values.
Scripts #A script is a collection of code points used to represent textual information in one or more writing systems.
As with categories, code points can only be assigned to a single script. Code points used in multiple scripts are therefore assigned to the special Common or Inherited scripts. More information on Wikipedia.
Show all 164 scripts Abbr Long / Notes Adlm Adlam Aghb Caucasian_Albanian Ahom Ahom Arab Arabic Armi Imperial_Aramaic Armn Armenian Avst Avestan Bali Balinese Bamu Bamum Bass Bassa_Vah Batk Batak Beng Bengali Bhks Bhaiksuki Bopo Bopomofo Brah Brahmi Brai Braille Bugi Buginese Buhd Buhid Cakm Chakma Cans Canadian_Aboriginal Cari Carian Cham Cham Cher Cherokee Chrs Chorasmian Copt Coptic, Qaac Cpmn Cypro_Minoan Cprt Cypriot Cyrl Cyrillic Deva Devanagari Diak Dives_Akuru Dogr Dogra Dsrt Deseret Dupl Duployan Egyp Egyptian_Hieroglyphs Elba Elbasan Elym Elymaic Ethi Ethiopic Geor Georgian Glag Glagolitic Gong Gunjala_Gondi Gonm Masaram_Gondi Goth Gothic Gran Grantha Grek Greek Gujr Gujarati Guru Gurmukhi Hang Hangul Hani Han Hano Hanunoo Hatr Hatran Hebr Hebrew Hira Hiragana Hluw Anatolian_Hieroglyphs Hmng Pahawh_Hmong Hmnp Nyiakeng_Puachue_Hmong Hung Old_Hungarian Ital Old_Italic Java Javanese Kali Kayah_Li Kana Katakana Kawi Kawi Khar Kharoshthi Khmr Khmer Khoj Khojki Kits Khitan_Small_Script Knda Kannada Kthi Kaithi Lana Tai_Tham Laoo Lao Latn Latin Lepc Lepcha Limb Limbu Lina Linear_A Linb Linear_B Lisu Lisu Lyci Lycian Lydi Lydian Mahj Mahajani Maka Makasar Mand Mandaic Mani Manichaean Marc Marchen Medf Medefaidrin Mend Mende_Kikakui Merc Meroitic_Cursive Mero Meroitic_Hieroglyphs Mlym Malayalam Modi Modi Mong Mongolian Mroo Mro Mtei Meetei_Mayek Mult Multani Mymr Myanmar Nagm Nag_Mundari Nand Nandinagari Narb Old_North_Arabian Nbat Nabataean Newa Newa Nkoo Nko Nshu Nushu Ogam Ogham Olck Ol_Chiki Orkh Old_Turkic Orya Oriya Osge Osage Osma Osmanya Ougr Old_Uyghur Palm Palmyrene Pauc Pau_Cin_Hau Perm Old_Permic Phag Phags_Pa Phli Inscriptional_Pahlavi Phlp Psalter_Pahlavi Phnx Phoenician Plrd Miao Prti Inscriptional_Parthian Rjng Rejang Rohg Hanifi_Rohingya Runr Runic Samr Samaritan Sarb Old_South_Arabian Saur Saurashtra Sgnw SignWriting Shaw Shavian Shrd Sharada Sidd Siddham Sind Khudawadi Sinh Sinhala Sogd Sogdian Sogo Old_Sogdian Sora Sora_Sompeng Soyo Soyombo Sund Sundanese Sylo Syloti_Nagri Syrc Syriac Tagb Tagbanwa Takr Takri Tale Tai_Le Talu New_Tai_Lue Taml Tamil Tang Tangut Tavt Tai_Viet Telu Telugu Tfng Tifinagh Tglg Tagalog Thaa Thaana Thai Thai Tibt Tibetan Tirh Tirhuta Tnsa Tangsa Toto Toto Ugar Ugaritic Vaii Vai Vith Vithkuqi Wara Warang_Citi Wcho Wancho Xpeo Old_Persian Xsux Cuneiform Yezi Yezidi Yiii Yi Zanb Zanabazar_Square Zinh Inherited Zyyy Common Zzzz Unknown
⚠️ not supported by Rust Support # PCRE JS Java Ruby Rust .NET Python RE2 ✅ ✅ ✅ ✅ ✅ ⛔ ⛔ ✅ Zzzz (Unknown) is not supported in Rust.
Script Extensions #Script extensions are similar to scripts; the difference is that script extensions can overlap, whereas scripts can not.
For example, the code point U+064B is used both in the Arab and Syriac script, so it is matched by both [scx:Arab] and [scx:Syriac]. If your regex flavor supports script extensions, they should almost always be preferred over scripts.
The list of script extensions is the same as the list of scripts.
Support # PCRE JS Java Ruby Rust .NET Python RE2 ✅ ✅ ⛔ ⛔ ✅ ⛔ ⛔ ⛔ Blocks #The Unicode character set is divided into blocks of consecutive code points that usually belong to the same script or serve a similar purpose.
There are often multiple blocks for a script. For example, there are 10 designated blocks for Latin code points: Basic_Latin, Latin_1_Supplement, Latin_Extended_Additional, and Latin_Extended_A through Latin_Extended_G. Furthermore, many blocks contain two or more scripts, which is not always clear from the name. For example, Latin_Extended_E includes a Greek code point.
Therefore, it is almost always better to use the script rather than the block, but Pomsky still supports blocks.
Blocks have to be prefixed with blk: or block:. Originally, blocks were prefixed with In, but this might be deprecated in the future:
# matches code points in the \`Basic_Latin\` block: [blk:Basic_Latin] # equivalent, but not recommended: [InBasic_Latin] Show all 328 blocks Names Adlam Aegean_Numbers Ahom Alchemical, Alchemical_Symbols Alphabetic_PF, Alphabetic_Presentation_Forms Anatolian_Hieroglyphs Ancient_Greek_Music, Ancient_Greek_Musical_Notation Ancient_Greek_Numbers Ancient_Symbols Arabic Arabic_Ext_A, Arabic_Extended_A Arabic_Ext_B, Arabic_Extended_B Arabic_Ext_C, Arabic_Extended_C Arabic_Math, Arabic_Mathematical_Alphabetic_Symbols Arabic_PF_A, Arabic_Presentation_Forms_A Arabic_PF_B, Arabic_Presentation_Forms_B Arabic_Sup, Arabic_Supplement Armenian Arrows ASCII, Basic_Latin Avestan Balinese Bamum Bamum_Sup, Bamum_Supplement Bassa_Vah Batak Bengali Bhaiksuki Block_Elements Bopomofo Bopomofo_Ext, Bopomofo_Extended Box_Drawing Brahmi Braille, Braille_Patterns Buginese Buhid Byzantine_Music, Byzantine_Musical_Symbols Carian Caucasian_Albanian Chakma Cham Cherokee Cherokee_Sup, Cherokee_Supplement Chess_Symbols Chorasmian CJK, CJK_Unified_Ideographs CJK_Compat, CJK_Compatibility CJK_Compat_Forms, CJK_Compatibility_Forms CJK_Compat_Ideographs, CJK_Compatibility_Ideographs CJK_Compat_Ideographs_Sup, CJK_Compatibility_Ideographs_Supplement CJK_Ext_A, CJK_Unified_Ideographs_Extension_A CJK_Ext_B, CJK_Unified_Ideographs_Extension_B CJK_Ext_C, CJK_Unified_Ideographs_Extension_C CJK_Ext_D, CJK_Unified_Ideographs_Extension_D CJK_Ext_E, CJK_Unified_Ideographs_Extension_E CJK_Ext_F, CJK_Unified_Ideographs_Extension_F CJK_Ext_G, CJK_Unified_Ideographs_Extension_G CJK_Ext_H, CJK_Unified_Ideographs_Extension_H CJK_Radicals_Sup, CJK_Radicals_Supplement CJK_Strokes CJK_Symbols, CJK_Symbols_And_Punctuation Compat_Jamo, Hangul_Compatibility_Jamo Control_Pictures Coptic Coptic_Epact_Numbers Counting_Rod, Counting_Rod_Numerals Cuneiform Cuneiform_Numbers, Cuneiform_Numbers_And_Punctuation Currency_Symbols Cypriot_Syllabary Cypro_Minoan Cyrillic Cyrillic_Ext_A, Cyrillic_Extended_A Cyrillic_Ext_B, Cyrillic_Extended_B Cyrillic_Ext_C, Cyrillic_Extended_C Cyrillic_Ext_D, Cyrillic_Extended_D Cyrillic_Sup, Cyrillic_Supplement, Cyrillic_Supplementary Deseret Devanagari Devanagari_Ext, Devanagari_Extended Devanagari_Ext_A, Devanagari_Extended_A Diacriticals, Combining_Diacritical_Marks Diacriticals_Ext, Combining_Diacritical_Marks_Extended Diacriticals_For_Symbols, Combining_Diacritical_Marks_For_Symbols, Combining_Marks_For_Symbols Diacriticals_Sup, Combining_Diacritical_Marks_Supplement Dingbats Dives_Akuru Dogra Domino, Domino_Tiles Duployan Early_Dynastic_Cuneiform Egyptian_Hieroglyph_Format_Controls Egyptian_Hieroglyphs Elbasan Elymaic Emoticons Enclosed_Alphanum, Enclosed_Alphanumerics Enclosed_Alphanum_Sup, Enclosed_Alphanumeric_Supplement Enclosed_CJK, Enclosed_CJK_Letters_And_Months Enclosed_Ideographic_Sup, Enclosed_Ideographic_Supplement Ethiopic Ethiopic_Ext, Ethiopic_Extended Ethiopic_Ext_A, Ethiopic_Extended_A Ethiopic_Ext_B, Ethiopic_Extended_B Ethiopic_Sup, Ethiopic_Supplement Geometric_Shapes Geometric_Shapes_Ext, Geometric_Shapes_Extended Georgian Georgian_Ext, Georgian_Extended Georgian_Sup, Georgian_Supplement Glagolitic Glagolitic_Sup, Glagolitic_Supplement Gothic Grantha Greek, Greek_And_Coptic Greek_Ext, Greek_Extended Gujarati Gunjala_Gondi Gurmukhi Half_And_Full_Forms, Halfwidth_And_Fullwidth_Forms Half_Marks, Combining_Half_Marks Hangul, Hangul_Syllables Hanifi_Rohingya Hanunoo Hatran Hebrew High_PU_Surrogates, High_Private_Use_Surrogates High_Surrogates Hiragana IDC, Ideographic_Description_Characters Ideographic_Symbols, Ideographic_Symbols_And_Punctuation Imperial_Aramaic Indic_Number_Forms, Common_Indic_Number_Forms Indic_Siyaq_Numbers Inscriptional_Pahlavi Inscriptional_Parthian IPA_Ext, IPA_Extensions Jamo, Hangul_Jamo Jamo_Ext_A, Hangul_Jamo_Extended_A Jamo_Ext_B, Hangul_Jamo_Extended_B Javanese Kaithi Kaktovik_Numerals Kana_Ext_A, Kana_Extended_A Kana_Ext_B, Kana_Extended_B Kana_Sup, Kana_Supplement Kanbun Kangxi, Kangxi_Radicals Kannada Katakana Katakana_Ext, Katakana_Phonetic_Extensions Kawi Kayah_Li Kharoshthi Khitan_Small_Script Khmer Khmer_Symbols Khojki Khudawadi Lao Latin_1_Sup, Latin_1_Supplement , Latin_1 Latin_Ext_A, Latin_Extended_A Latin_Ext_Additional, Latin_Extended_Additional Latin_Ext_B, Latin_Extended_B Latin_Ext_C, Latin_Extended_C Latin_Ext_D, Latin_Extended_D Latin_Ext_E, Latin_Extended_E Latin_Ext_F, Latin_Extended_F Latin_Ext_G, Latin_Extended_G Lepcha Letterlike_Symbols Limbu Linear_A Linear_B_Ideograms Linear_B_Syllabary Lisu Lisu_Sup, Lisu_Supplement Low_Surrogates Lycian Lydian Mahajani Mahjong, Mahjong_Tiles Makasar Malayalam Mandaic Manichaean Marchen Masaram_Gondi Math_Alphanum, Mathematical_Alphanumeric_Symbols Math_Operators, Mathematical_Operators Mayan_Numerals Medefaidrin Meetei_Mayek Meetei_Mayek_Ext, Meetei_Mayek_Extensions Mende_Kikakui Meroitic_Cursive Meroitic_Hieroglyphs Miao Misc_Arrows, Miscellaneous_Symbols_And_Arrows Misc_Math_Symbols_A, Miscellaneous_Mathematical_Symbols_A Misc_Math_Symbols_B, Miscellaneous_Mathematical_Symbols_B Misc_Pictographs, Miscellaneous_Symbols_And_Pictographs Misc_Symbols, Miscellaneous_Symbols Misc_Technical, Miscellaneous_Technical Modi Modifier_Letters, Spacing_Modifier_Letters Modifier_Tone_Letters Mongolian Mongolian_Sup, Mongolian_Supplement Mro Multani Music, Musical_Symbols Myanmar Myanmar_Ext_A, Myanmar_Extended_A Myanmar_Ext_B, Myanmar_Extended_B Nabataean Nag_Mundari Nandinagari NB, No_Block New_Tai_Lue Newa NKo Number_Forms Nushu Nyiakeng_Puachue_Hmong OCR, Optical_Character_Recognition Ogham Ol_Chiki Old_Hungarian Old_Italic Old_North_Arabian Old_Permic Old_Persian Old_Sogdian Old_South_Arabian Old_Turkic Old_Uyghur Oriya Ornamental_Dingbats Osage Osmanya Ottoman_Siyaq_Numbers Pahawh_Hmong Palmyrene Pau_Cin_Hau Phags_Pa Phaistos, Phaistos_Disc Phoenician Phonetic_Ext, Phonetic_Extensions Phonetic_Ext_Sup, Phonetic_Extensions_Supplement Playing_Cards Psalter_Pahlavi PUA, Private_Use_Area, Private_Use Punctuation, General_Punctuation Rejang Rumi, Rumi_Numeral_Symbols Runic Samaritan Saurashtra Sharada Shavian Shorthand_Format_Controls Siddham Sinhala Sinhala_Archaic_Numbers Small_Forms, Small_Form_Variants Small_Kana_Ext, Small_Kana_Extension Sogdian Sora_Sompeng Soyombo Specials Sundanese Sundanese_Sup, Sundanese_Supplement Sup_Arrows_A, Supplemental_Arrows_A Sup_Arrows_B, Supplemental_Arrows_B Sup_Arrows_C, Supplemental_Arrows_C Sup_Math_Operators, Supplemental_Mathematical_Operators Sup_PUA_A, Supplementary_Private_Use_Area_A Sup_PUA_B, Supplementary_Private_Use_Area_B Sup_Punctuation, Supplemental_Punctuation Sup_Symbols_And_Pictographs, Supplemental_Symbols_And_Pictographs Super_And_Sub, Superscripts_And_Subscripts Sutton_SignWriting Syloti_Nagri Symbols_And_Pictographs_Ext_A, Symbols_And_Pictographs_Extended_A Symbols_For_Legacy_Computing Syriac Syriac_Sup, Syriac_Supplement Tagalog Tagbanwa Tags Tai_Le Tai_Tham Tai_Viet Tai_Xuan_Jing, Tai_Xuan_Jing_Symbols Takri Tamil Tamil_Sup, Tamil_Supplement Tangsa Tangut Tangut_Components Tangut_Sup, Tangut_Supplement Telugu Thaana Thai Tibetan Tifinagh Tirhuta Toto Transport_And_Map, Transport_And_Map_Symbols UCAS, Unified_Canadian_Aboriginal_Syllabics, Canadian_Syllabics UCAS_Ext, Unified_Canadian_Aboriginal_Syllabics_Extended UCAS_Ext_A, Unified_Canadian_Aboriginal_Syllabics_Extended_A Ugaritic Vai Vedic_Ext, Vedic_Extensions Vertical_Forms Vithkuqi VS, Variation_Selectors VS_Sup, Variation_Selectors_Supplement Wancho Warang_Citi Yezidi Yi_Radicals Yi_Syllables Yijing, Yijing_Hexagram_Symbols Zanabazar_Square Znamenny_Music, Znamenny_Musical_Notation Support # PCRE JS Java Ruby Rust .NET Python RE2 ⛔ ⛔ ✅ ✅ ⛔ (✅) ⛔ ⛔ Java doesn\u0026rsquo;t support the block No_Block.
Ruby doesn\u0026rsquo;t support the following blocks:
Arabic_Extended_C CJK_Unified_Ideographs_Extension_H Cyrillic_Extended_D Devanagari_Extended_A Kaktovik_Numerals .NET only supports blocks in the Basic Multilingual Plane (BMP). And even in the BMP, not all blocks are supported:
Show 108 blocks supported by .NET Alphabetic_Presentation_Forms Arabic Arabic_PresentationForms_A Arabic_PresentationForms_B Armenian Arrows Basic_Latin Bengali Block_Elements Bopomofo Bopomofo_Extended Box_Drawing Braille_Patterns Buhid CJK_Compatibility CJK_Compatibility_Forms CJK_Compatibility_Ideographs CJK_Radicals_Supplement CJK_Symbols_And_Punctuation CJK_Unified_Ideographs CJK_Unified_Ideographs_Extension_A Cherokee Combining_Diacritical_Marks Combining_Diacritical_Marks_For_Symbols Combining_Half_Marks Combining_Marks_For_Symbols Control_Pictures Currency_Symbols Cyrillic Cyrillic_Supplement Devanagari Dingbats Enclosed_Alphanumerics Enclosed_CJK_Letters_And_Months Ethiopic General_Punctuation Geometric_Shapes Georgian Greek Greek_Extended Greek_And_Coptic Gujarati Gurmukhi Halfwidth_And_Fullwidth_Forms Hangul_Compatibility_Jamo Hangul_Jamo Hangul_Syllables Hanunoo Hebrew High_Private_Use_Surrogates High_Surrogates Hiragana IPA_Extensions Ideographic_Description_Characters Kanbun Kangxi_Radicals Kannada Katakana Katakana_Phonetic_Extensions Khmer Khmer_Symbols Lao Latin_1_Supplement Latin_Extended_A Latin_Extended_B Latin_Extended_Additional Letterlike_Symbols Limbu Low_Surrogates Malayalam Mathematical_Operators Miscellaneous_Mathematical_Symbols_A Miscellaneous_Mathematical_Symbols_B Miscellaneous_Symbols Miscellaneous_Symbols_And_Arrows Miscellaneous_Technical Mongolian Myanmar Number_Forms Ogham Optical_Character_Recognition Oriya Phonetic_Extensions Private_Use Private_Use_Area Runic Sinhala Small_Form_Variants Spacing_Modifier_Letters Specials Superscripts_And_Subscripts Supplemental_Arrows_A Supplemental_Arrows_B Supplemental_Mathematical_Operators Syriac Tagalog Tagbanwa TaiLe Tamil Telugu Thaana Thai Tibetan Unified_Canadian_Aboriginal_Syllabics Variation_Selectors Yi_Radicals Yi_Syllables Yijing_Hexagram_Symbols Boolean properties #There are a number of boolean properties (meaning they are either Yes or No), which you can use in Pomsky by simply putting them in square brackets:
# match code points with Diacritic=Yes [Diacritic] Show all 53 boolean properties Abbr Long ASCII ASCII AHex ASCII_Hex_Digit Alpha Alphabetic Any Any Assigned Assigned
⚠️ not supported in PCRE Bidi_C Bidi_Control Bidi_M Bidi_Mirrored
⚠️ not supported in Ruby CI Case_Ignorable Cased Cased CWCF Changes_When_Casefolded
⚠️ not supported in PCRE, Ruby, and Rust CWCM Changes_When_Casemapped CWL Changes_When_Lowercased CWKCF Changes_When_NFKC_Casefolded CWT Changes_When_Titlecased CWU Changes_When_Uppercased Dash Dash DI Default_Ignorable_Code_Point Dep Deprecated Dia Diacritic Emoji Emoji EComp Emoji_Component EMod Emoji_Modifier EBase Emoji_Modifier_Base EPres Emoji_Presentation ExtPict Extended_Pictographic Ext Extender Gr_Base Grapheme_Base Gr_Ext Grapheme_Extend Hex Hex_Digit IDSB IDS_Binary_Operator IDST IDS_Trinary_Operator IDC ID_Continue IDS ID_Start Ideo Ideographic Join_C Join_Control LOE Logical_Order_Exception Lower Lowercase Math Math NChar Noncharacter_Code_Point Pat_Syn Pattern_Syntax Pat_WS Pattern_White_Space QMark Quotation_Mark Radical Radical RI Regional_Indicator STerm Sentence_Terminal SD Soft_Dotted Term Terminal_Punctuation UIdeo Unified_Ideograph Upper Uppercase VS Variation_Selector space White_Space XIDC XID_Continue XIDS XID_Start Support # PCRE JS Java Ruby Rust .NET Python RE2 ✅ ✅ (✅) ✅ ✅ ⛔ ⛔ ⛔ PCRE, Ruby, and Rust don\u0026rsquo;t support all properties – see the list.
Java only supports the following 20 boolean properties:
Alphabetic Ideographic Letter Lowercase Uppercase Titlecase Punctuation Control White_Space Digit Hex_Digit Join_Control Noncharacter_Code_Point Assigned Emoji Emoji_Presentation Emoji_Modifier Emoji_Modifier_Base Emoji_Component Extended_Pictographic `}),e.add({id:51,href:"/docs/reference/comparison/",title:"Comparison (moved)",description:"See how Pomsky compares to similar projects",content:`This page was moved here.
`}),e.add({id:52,href:"/docs/reference/security/",title:"Security (moved)",description:"Advice how to use Pomsky securely",content:`This page was moved here.
`}),e.add({id:53,href:"/docs/reference/built-in-variables/",title:"Built-in variables (moved)",description:"Variables provided by Pomsky out of the box",content:`This page was moved here.
`}),e.add({id:54,href:"/docs/reference/unicode-properties/",title:"Unicode properties (moved)",description:"Exhaustive list of Unicode general categories, scripts, blocks and other properties supported by Pomsky",content:`This page was moved here.
`}),e.add({id:55,href:"/docs/",title:"Docs",description:"Pomsky documentation.",content:` Get Started Start your journey with Pomsky, the next-level regular expression language. Language Tour Introduction to matching text with Pomsky. No prior knowledge required. Examples See real-world Pomsky expressions, and how they compare to traditional regexes. Reference Comprehensive technical documentation about the language. Appendix Additional resources All pages:
`}),search.addEventListener("input",t,!0);function t(){const s=5;var n=this.value,o=e.search(n,{limit:s,enrich:!0});const t=new Map;for(const e of o.flatMap(e=>e.result)){if(t.has(e.doc.href))continue;t.set(e.doc.href,e.doc)}if(suggestions.innerHTML="",suggestions.classList.remove("d-none"),t.size===0&&n){const e=document.createElement("div");e.innerHTML=`No results for "<strong>${n}</strong>"`,e.classList.add("suggestion__no-results"),suggestions.appendChild(e);return}for(const[r,a]of t){const n=document.createElement("div");suggestions.appendChild(n);const e=document.createElement("a");e.href=r,n.appendChild(e);const o=document.createElement("span");o.textContent=a.title,o.classList.add("suggestion__title"),e.appendChild(o);const i=document.createElement("span");if(i.textContent=a.description,i.classList.add("suggestion__description"),e.appendChild(i),suggestions.appendChild(n),suggestions.childElementCount==s)break}}})()