Skip to content

Commit 4d0f52d

Browse files
authored
Merge pull request #4 from stof/unicode_columns
Count Unicode codepoints to compute columns
2 parents 21e3f4c + 67ae68d commit 4d0f52d

File tree

4 files changed

+37
-3
lines changed

4 files changed

+37
-3
lines changed

composer.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
},
2121
"require": {
2222
"php": ">=8.1",
23+
"ext-mbstring": "*",
2324
"league/uri": "^7.6",
2425
"league/uri-interfaces": "^7.6"
2526
},

src/SimpleSourceLocation.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ final class SimpleSourceLocation extends SourceLocationMixin
1212
/**
1313
* Creates a new location indicating $offset within $sourceUrl.
1414
*
15-
* $line and $column default to assuming the source is a single line. This
15+
* $line and $column default to assuming the source is a single ASCII line. This
1616
* means that $line defaults to 0 and $column defaults to $offset.
1717
*/
1818
public function __construct(

src/SourceFile.php

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -212,12 +212,15 @@ private function binarySearch(int $offset): int
212212

213213
/**
 * Returns the 0-based column at which the given offset sits on its line.
 *
 * Offsets are byte offsets into the source string, whereas columns are
 * counted in Unicode codepoints: the bytes between the start of the line
 * and $offset are measured as UTF-8 text rather than as raw bytes.
 */
public function getColumn(int $offset): int
{
    $lineStart = $this->lineStarts[$this->getLine($offset)];

    // Raw bytes from the beginning of the line up to (but excluding) $offset.
    $prefix = substr($this->string, $lineStart, $offset - $lineStart);

    return mb_strlen($prefix, 'UTF-8');
}
222225

223226
/**
@@ -237,7 +240,17 @@ public function getOffset(int $line, int $column = 0): int
237240
throw new \OutOfRangeException('Column may not be negative.');
238241
}
239242

240-
$result = $this->lineStarts[$line] + $column;
243+
if ($column === 0) {
244+
$result = $this->lineStarts[$line];
245+
} else {
246+
$lineContent = substr($this->string, $this->lineStarts[$line], $this->lineStarts[$line + 1] ?? null);
247+
248+
if ($column > mb_strlen($lineContent, 'UTF-8')) {
249+
throw new \OutOfRangeException("Line $line doesn't have $column columns.");
250+
}
251+
252+
$result = $this->lineStarts[$line] + \strlen(mb_substr($lineContent, 0, $column, 'UTF-8'));
253+
}
241254

242255
if ($result > \strlen($this->string) || ($line + 1 < \count($this->lineStarts) && $result >= $this->lineStarts[$line + 1])) {
243256
throw new \OutOfRangeException("Line $line doesn't have $column columns.");

tests/SourceFileTest.php

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,4 +254,24 @@ public function testGetTextEndDefaultsToTheEndOfTheFile(): void
254254
{
255255
self::assertEquals("g boom\nzip zap zop", $this->file->getText(20));
256256
}
257+
258+
public function testGetColumnCountsUnicodeCharacters(): void
{
    $file = SourceFile::fromString("foo\nbar éà\nbaz");

    // Byte offset => expected column. The accented letters on the second
    // line take more than one byte each, so the offsets advance faster
    // than the columns do.
    $expectedColumns = [
        8 => 4,
        10 => 5,
        12 => 6,
        13 => 0,
    ];

    foreach ($expectedColumns as $offset => $column) {
        self::assertEquals($column, $file->getColumn($offset));
    }
}
267+
268+
public function testGetOffsetCountsUnicodeCharactersForColumns(): void
{
    $file = SourceFile::fromString("foo\nbar éà\nbaz");

    // Each case is [line, column, expected byte offset]. Columns count
    // characters, so the expected offsets jump by more than one across
    // the accented letters.
    $cases = [
        [1, 4, 8],
        [1, 5, 10],
        [1, 6, 12],
        [2, 0, 13],
    ];

    foreach ($cases as [$line, $column, $expectedOffset]) {
        self::assertEquals($expectedOffset, $file->getOffset($line, $column));
    }
}
257277
}

0 commit comments

Comments
 (0)