diff --git a/README.md b/README.md index f36cad9..cd8f4a5 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,16 @@ -# ocaml-vlq +# OCaml VLQ -A simple library for encoding [variable-length quantities](https://en.wikipedia.org/wiki/Variable-length_quantity). +A simple library for encoding and decoding [variable-length quantities](https://en.wikipedia.org/wiki/Variable-length_quantity). It currently supports writing a base64-encoded integer to a `Buffer`. Patches implementing other forms of encoding are welcome! +## Installation + +### OPAM + +```bash +opam install vlq +``` ## Example diff --git a/src/vlq.ml b/src/vlq.ml index 11e354c..67a95a0 100644 --- a/src/vlq.ml +++ b/src/vlq.ml @@ -5,68 +5,64 @@ * LICENSE file in the root directory of this source tree. *) -(* VLQ (variable-length quantity) encoder - https://en.wikipedia.org/wiki/Variable-length_quantity *) - module type Config = sig - val shift: int - val char_of_digit: int -> char - val digit_of_char: char -> int + val shift : int + val char_of_int : int -> char + val int_of_char : char -> int end module type S = sig - val encode: Buffer.t -> int -> unit - val decode: char Stream.t -> int + val encode : int -> string + val decode : string -> int end exception Unexpected_eof -exception Invalid_base64 of char +exception Char_of_int_failure of int +exception Int_of_char_failure of char module Make (C: Config) = struct let vlq_base = 1 lsl C.shift let vlq_base_mask = vlq_base - 1 - let vlq_continuation_bit = vlq_base (* MSB *) + let vlq_continuation_bit = vlq_base - (** - * Converts from a two-complement value to a value where the sign bit is - * placed in the least significant bit. For example, as decimals: - * 1 becomes 2 (10 binary), -1 becomes 3 (11 binary) - * 2 becomes 4 (100 binary), -2 becomes 5 (101 binary) - *) + (** Converts from a two's-complement value to a value where the sign bit is + placed in the least significant bit. 
For example, as decimals: + 1 becomes 2 (10 binary), -1 becomes 3 (11 binary) + 2 becomes 4 (100 binary), -2 becomes 5 (101 binary) *) let vlq_signed_of_int value = - if value < 0 then ((-value) lsl 1) + 1 else (value lsl 1) + 0 - - (* Write the value to the buffer, as multiple characters as necessary *) - let rec encode_vlq buf vlq = - let digit = vlq land vlq_base_mask in - let vlq = vlq lsr C.shift in - if vlq = 0 then Buffer.add_char buf (C.char_of_digit digit) - else begin - (* set the continuation bit *) - Buffer.add_char buf (C.char_of_digit (digit lor vlq_continuation_bit)); - encode_vlq buf vlq - end + match value < 0 with + | true -> ((-value) lsl 1) + 1 + | false -> value lsl 1 - (* Encodes `value` as a VLQ and writes it to `buf` *) - let encode buf value = + let encode value = let vlq = vlq_signed_of_int value in - encode_vlq buf vlq + let rec loop vlq encoded = + let digit = vlq land vlq_base_mask in + let vlq = vlq lsr C.shift in + match vlq = 0 with + | true -> encoded ^ Char.escaped (C.char_of_int digit) + | false -> + loop vlq (encoded ^ Char.escaped + (C.char_of_int (digit lor vlq_continuation_bit))) in + loop vlq "" - let decode = - let rec helper (acc, shift) stream = + let decode value = + let stream = Stream.of_string value in + let rec loop shift decoded = let chr = try Stream.next stream - with Stream.Failure -> raise Unexpected_eof - in - let digit = C.digit_of_char chr in - let continued = (digit land vlq_continuation_bit) != 0 in - let acc = acc + (digit land vlq_base_mask) lsl shift in - if continued then helper (acc, shift + C.shift) stream else acc - in - fun stream -> - let acc = helper (0, 0) stream in - let abs = acc / 2 in - if acc land 1 = 0 then abs else -(abs) + with Stream.Failure -> raise Unexpected_eof in + let digit = C.int_of_char chr in + let decoded = decoded + (digit land vlq_base_mask) lsl shift in + match digit land vlq_continuation_bit with + | 0 -> decoded + | _ -> (* Continuation found *) + loop (shift + C.shift) 
decoded in + let decoded = loop 0 0 in + let abs = decoded / 2 in + match decoded land 1 with + | 0 -> abs + | _ -> -(abs) end module Base64 = Make (struct @@ -74,12 +70,13 @@ module Base64 = Make (struct let base64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/" (* Convert a number between 0 and 63 to a base64 char *) - let char_of_digit digit = - if 0 <= digit && digit < String.length base64 - then base64.[digit] - else failwith (Printf.sprintf "Must be between 0 and 63: %d" digit) + let char_of_int digit = + match digit >= 0 && digit < String.length base64 with + | true -> base64.[digit] + | _ -> raise (Char_of_int_failure digit) - let digit_of_char chr = - try String.index base64 chr - with Not_found -> raise (Invalid_base64 chr) + let int_of_char chr = + match String.index_opt base64 chr with + | Some index -> index + | None -> raise (Int_of_char_failure chr) end) diff --git a/src/vlq.mli b/src/vlq.mli index b79c466..88f26e6 100644 --- a/src/vlq.mli +++ b/src/vlq.mli @@ -5,12 +5,59 @@ * LICENSE file in the root directory of this source tree. *) +(** VLQ encoding and decoding. + + This module implements VLQ encoding and decoding with support for custom + shift and also embeds by default a Base 64 construction of the module. + Check {{:https://en.wikipedia.org/wiki/Variable-length_quantity} this article} +*) + +module type Config = sig + val shift : int + (** VLQ base shift to compute VLQ base *) + + val char_of_int : int -> char + (** Takes an int and returns the corresponding char for it, + which will be then used to build the encoded value. *) + + val int_of_char : char -> int + (** Takes a char and returns the corresponding int for it, + which will be then used to build the decoded value. 
*) +end + exception Unexpected_eof -exception Invalid_base64 of char + (** Raised by [decode] when the input ends before a digit without the + continuation bit has been read *) + +exception Char_of_int_failure of int + (** Raised when the provided int cannot be converted to a char *) + +exception Int_of_char_failure of char + (** Raised when the provided char cannot be converted to an int *) module type S = sig - val encode: Buffer.t -> int -> unit - val decode: char Stream.t -> int + val encode : int -> string + (** [encode value] converts an integer to a VLQ string *) + + val decode : string -> int + (** [decode value] converts a VLQ string to an integer *) end + (** Module signature *) + +module Make (C: Config) : S + (** Expose [Make] to allow custom constructions *) module Base64 : S + (** A single base 64 digit can contain 6 bits of data. For the base 64 + variable length quantities we use in the source map spec, the first + bit is the sign, the next four bits are the actual value, and the 6th + bit is the continuation bit. The continuation bit tells us whether + there are more digits in this value following this digit. 
+ + {v + Continuation + | Sign + | | + V V + 101011 + v} + *) diff --git a/test/jbuild b/test/jbuild index 18d1d82..0e96c84 100644 --- a/test/jbuild +++ b/test/jbuild @@ -2,7 +2,7 @@ (executable ( (name test) - (libraries (vlq oUnit)) + (libraries (vlq ounit)) )) (alias ( diff --git a/test/test.ml b/test/test.ml index e24037f..420cb69 100644 --- a/test/test.ml +++ b/test/test.ml @@ -7,54 +7,549 @@ open OUnit2 -let cases = [ - (* input, output *) +let vlqs = [ + -255, "/P"; + -254, "9P"; + -253, "7P"; + -252, "5P"; + -251, "3P"; + -250, "1P"; + -249, "zP"; + -248, "xP"; + -247, "vP"; + -246, "tP"; + -245, "rP"; + -244, "pP"; + -243, "nP"; + -242, "lP"; + -241, "jP"; + -240, "hP"; + -239, "/O"; + -238, "9O"; + -237, "7O"; + -236, "5O"; + -235, "3O"; + -234, "1O"; + -233, "zO"; + -232, "xO"; + -231, "vO"; + -230, "tO"; + -229, "rO"; + -228, "pO"; + -227, "nO"; + -226, "lO"; + -225, "jO"; + -224, "hO"; + -223, "/N"; + -222, "9N"; + -221, "7N"; + -220, "5N"; + -219, "3N"; + -218, "1N"; + -217, "zN"; + -216, "xN"; + -215, "vN"; + -214, "tN"; + -213, "rN"; + -212, "pN"; + -211, "nN"; + -210, "lN"; + -209, "jN"; + -208, "hN"; + -207, "/M"; + -206, "9M"; + -205, "7M"; + -204, "5M"; + -203, "3M"; + -202, "1M"; + -201, "zM"; + -200, "xM"; + -199, "vM"; + -198, "tM"; + -197, "rM"; + -196, "pM"; + -195, "nM"; + -194, "lM"; + -193, "jM"; + -192, "hM"; + -191, "/L"; + -190, "9L"; + -189, "7L"; + -188, "5L"; + -187, "3L"; + -186, "1L"; + -185, "zL"; + -184, "xL"; + -183, "vL"; + -182, "tL"; + -181, "rL"; + -180, "pL"; + -179, "nL"; + -178, "lL"; + -177, "jL"; + -176, "hL"; + -175, "/K"; + -174, "9K"; + -173, "7K"; + -172, "5K"; + -171, "3K"; + -170, "1K"; + -169, "zK"; + -168, "xK"; + -167, "vK"; + -166, "tK"; + -165, "rK"; + -164, "pK"; + -163, "nK"; + -162, "lK"; + -161, "jK"; + -160, "hK"; + -159, "/J"; + -158, "9J"; + -157, "7J"; + -156, "5J"; + -155, "3J"; + -154, "1J"; + -153, "zJ"; + -152, "xJ"; + -151, "vJ"; + -150, "tJ"; + -149, "rJ"; + -148, "pJ"; + -147, "nJ"; + -146, 
"lJ"; + -145, "jJ"; + -144, "hJ"; + -143, "/I"; + -142, "9I"; + -141, "7I"; + -140, "5I"; + -139, "3I"; + -138, "1I"; + -137, "zI"; + -136, "xI"; + -135, "vI"; + -134, "tI"; + -133, "rI"; + -132, "pI"; + -131, "nI"; + -130, "lI"; + -129, "jI"; + -128, "hI"; + -127, "/H"; + -126, "9H"; + -125, "7H"; + -124, "5H"; + -123, "3H"; + -122, "1H"; + -121, "zH"; + -120, "xH"; + -119, "vH"; + -118, "tH"; + -117, "rH"; + -116, "pH"; + -115, "nH"; + -114, "lH"; + -113, "jH"; + -112, "hH"; + -111, "/G"; + -110, "9G"; + -109, "7G"; + -108, "5G"; + -107, "3G"; + -106, "1G"; + -105, "zG"; + -104, "xG"; + -103, "vG"; + -102, "tG"; + -101, "rG"; + -100, "pG"; + -99, "nG"; + -98, "lG"; + -97, "jG"; + -96, "hG"; + -95, "/F"; + -94, "9F"; + -93, "7F"; + -92, "5F"; + -91, "3F"; + -90, "1F"; + -89, "zF"; + -88, "xF"; + -87, "vF"; + -86, "tF"; + -85, "rF"; + -84, "pF"; + -83, "nF"; + -82, "lF"; + -81, "jF"; + -80, "hF"; + -79, "/E"; + -78, "9E"; + -77, "7E"; + -76, "5E"; + -75, "3E"; + -74, "1E"; + -73, "zE"; + -72, "xE"; + -71, "vE"; + -70, "tE"; + -69, "rE"; + -68, "pE"; + -67, "nE"; + -66, "lE"; + -65, "jE"; + -64, "hE"; + -63, "/D"; + -62, "9D"; + -61, "7D"; + -60, "5D"; + -59, "3D"; + -58, "1D"; + -57, "zD"; + -56, "xD"; + -55, "vD"; + -54, "tD"; + -53, "rD"; + -52, "pD"; + -51, "nD"; + -50, "lD"; + -49, "jD"; + -48, "hD"; + -47, "/C"; + -46, "9C"; + -45, "7C"; + -44, "5C"; + -43, "3C"; + -42, "1C"; + -41, "zC"; + -40, "xC"; + -39, "vC"; + -38, "tC"; + -37, "rC"; + -36, "pC"; + -35, "nC"; + -34, "lC"; + -33, "jC"; + -32, "hC"; + -31, "/B"; + -30, "9B"; + -29, "7B"; + -28, "5B"; + -27, "3B"; + -26, "1B"; + -25, "zB"; + -24, "xB"; + -23, "vB"; + -22, "tB"; + -21, "rB"; + -20, "pB"; + -19, "nB"; + -18, "lB"; + -17, "jB"; + -16, "hB"; + -15, "f"; + -14, "d"; + -13, "b"; + -12, "Z"; + -11, "X"; + -10, "V"; + -9, "T"; + -8, "R"; + -7, "P"; + -6, "N"; + -5, "L"; + -4, "J"; + -3, "H"; + -2, "F"; + -1, "D"; 0, "A"; 1, "C"; - -1, "D"; + 2, "E"; + 3, "G"; + 4, "I"; + 5, "K"; + 6, "M"; + 7, "O"; 
+ 8, "Q"; + 9, "S"; + 10, "U"; + 11, "W"; + 12, "Y"; + 13, "a"; + 14, "c"; + 15, "e"; + 16, "gB"; + 17, "iB"; + 18, "kB"; + 19, "mB"; + 20, "oB"; + 21, "qB"; + 22, "sB"; + 23, "uB"; + 24, "wB"; + 25, "yB"; + 26, "0B"; + 27, "2B"; + 28, "4B"; + 29, "6B"; + 30, "8B"; + 31, "+B"; + 32, "gC"; + 33, "iC"; + 34, "kC"; + 35, "mC"; + 36, "oC"; + 37, "qC"; + 38, "sC"; + 39, "uC"; + 40, "wC"; + 41, "yC"; + 42, "0C"; + 43, "2C"; + 44, "4C"; + 45, "6C"; + 46, "8C"; + 47, "+C"; + 48, "gD"; + 49, "iD"; + 50, "kD"; + 51, "mD"; + 52, "oD"; + 53, "qD"; + 54, "sD"; + 55, "uD"; + 56, "wD"; + 57, "yD"; + 58, "0D"; + 59, "2D"; + 60, "4D"; + 61, "6D"; + 62, "8D"; + 63, "+D"; + 64, "gE"; + 65, "iE"; + 66, "kE"; + 67, "mE"; + 68, "oE"; + 69, "qE"; + 70, "sE"; + 71, "uE"; + 72, "wE"; + 73, "yE"; + 74, "0E"; + 75, "2E"; + 76, "4E"; + 77, "6E"; + 78, "8E"; + 79, "+E"; + 80, "gF"; + 81, "iF"; + 82, "kF"; + 83, "mF"; + 84, "oF"; + 85, "qF"; + 86, "sF"; + 87, "uF"; + 88, "wF"; + 89, "yF"; + 90, "0F"; + 91, "2F"; + 92, "4F"; + 93, "6F"; + 94, "8F"; + 95, "+F"; + 96, "gG"; + 97, "iG"; + 98, "kG"; + 99, "mG"; + 100, "oG"; + 101, "qG"; + 102, "sG"; + 103, "uG"; + 104, "wG"; + 105, "yG"; + 106, "0G"; + 107, "2G"; + 108, "4G"; + 109, "6G"; + 110, "8G"; + 111, "+G"; + 112, "gH"; + 113, "iH"; + 114, "kH"; + 115, "mH"; + 116, "oH"; + 117, "qH"; + 118, "sH"; + 119, "uH"; + 120, "wH"; + 121, "yH"; + 122, "0H"; 123, "2H"; - 123456789, "qxmvrH"; + 124, "4H"; + 125, "6H"; + 126, "8H"; + 127, "+H"; + 128, "gI"; + 129, "iI"; + 130, "kI"; + 131, "mI"; + 132, "oI"; + 133, "qI"; + 134, "sI"; + 135, "uI"; + 136, "wI"; + 137, "yI"; + 138, "0I"; + 139, "2I"; + 140, "4I"; + 141, "6I"; + 142, "8I"; + 143, "+I"; + 144, "gJ"; + 145, "iJ"; + 146, "kJ"; + 147, "mJ"; + 148, "oJ"; + 149, "qJ"; + 150, "sJ"; + 151, "uJ"; + 152, "wJ"; + 157, "6J"; + 158, "8J"; + 159, "+J"; + 160, "gK"; + 161, "iK"; + 162, "kK"; + 163, "mK"; + 164, "oK"; + 165, "qK"; + 166, "sK"; + 167, "uK"; + 168, "wK"; + 169, "yK"; + 170, "0K"; + 171, "2K"; 
+ 172, "4K"; + 173, "6K"; + 174, "8K"; + 175, "+K"; + 176, "gL"; + 177, "iL"; + 178, "kL"; + 179, "mL"; + 180, "oL"; + 181, "qL"; + 182, "sL"; + 183, "uL"; + 184, "wL"; + 185, "yL"; + 186, "0L"; + 187, "2L"; + 188, "4L"; + 189, "6L"; + 190, "8L"; + 191, "+L"; + 192, "gM"; + 193, "iM"; + 194, "kM"; + 195, "mM"; + 196, "oM"; + 197, "qM"; + 198, "sM"; + 199, "uM"; + 200, "wM"; + 201, "yM"; + 202, "0M"; + 203, "2M"; + 204, "4M"; + 205, "6M"; + 206, "8M"; + 207, "+M"; + 208, "gN"; + 209, "iN"; + 210, "kN"; + 211, "mN"; + 212, "oN"; + 213, "qN"; + 214, "sN"; + 215, "uN"; + 216, "wN"; + 217, "yN"; + 218, "0N"; + 219, "2N"; + 220, "4N"; + 221, "6N"; + 222, "8N"; + 223, "+N"; + 224, "gO"; + 225, "iO"; + 226, "kO"; + 227, "mO"; + 228, "oO"; + 229, "qO"; + 230, "sO"; + 231, "uO"; + 232, "wO"; + 233, "yO"; + 234, "0O"; + 235, "2O"; + 236, "4O"; + 237, "6O"; + 238, "8O"; + 239, "+O"; + 240, "gP"; + 241, "iP"; + 242, "kP"; + 243, "mP"; + 244, "oP"; + 245, "qP"; + 246, "sP"; + 247, "uP"; + 248, "wP"; + 249, "yP"; + 250, "0P"; + 251, "2P"; + 252, "4P"; + 253, "6P"; + 254, "8P"; + 255, "+P" ] let tests = "vlq" >::: [ "encode" >:: begin fun ctxt -> List.iter (fun (input, expected) -> - let buf = Buffer.create 10 in - Vlq.Base64.encode buf input; - let actual = Buffer.contents buf in + let actual = Vlq.Base64.encode input in assert_equal ~ctxt ~printer:(fun x -> x) ~msg:(Printf.sprintf "Vql.encode buf %d:" input) expected actual - ) cases + ) vlqs end; "decode" >:: begin fun ctxt -> List.iter (fun (input, expected) -> - let stream = Stream.of_string expected in - let actual = Vlq.Base64.decode stream in + let actual = Vlq.Base64.decode expected in assert_equal ~ctxt ~printer:string_of_int ~msg:(Printf.sprintf "Vql.decode %S:" expected) input actual - ) cases + ) vlqs end; "decode_extra" >:: begin fun ctxt -> - let stream = Stream.of_string "qxmvrH the rest is ignored" in - let actual = Vlq.Base64.decode stream in + let actual = Vlq.Base64.decode "qxmvrH the rest is ignored" in 
assert_equal ~ctxt ~printer:string_of_int 123456789 actual end; "decode_eof" >:: begin fun _ctxt -> - let stream = Stream.of_string "qxmvr" in assert_raises Vlq.Unexpected_eof (fun () -> - Vlq.Base64.decode stream + Vlq.Base64.decode "qxmvr" ) end; "decode_invalid" >:: begin fun _ctxt -> - let stream = Stream.of_string "qx." in - assert_raises (Vlq.Invalid_base64 '.') (fun () -> - Vlq.Base64.decode stream + assert_raises (Vlq.Int_of_char_failure '.') (fun () -> + Vlq.Base64.decode "qx." ) end; ] diff --git a/vlq.descr b/vlq.descr index 1332ddd..04f8211 100644 --- a/vlq.descr +++ b/vlq.descr @@ -1 +1 @@ -A simple library for encoding variable-length quantities. +A simple library for encoding and decoding variable-length quantities.