Skip to content

Commit

Permalink
decoding options for ETF
Browse files Browse the repository at this point in the history
  • Loading branch information
kvakvs committed Jun 2, 2017
1 parent 4c67c72 commit 9ecbccc
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 15 deletions.
37 changes: 29 additions & 8 deletions Pyrlang/Dist/etf.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,14 @@

""" Module implements encoder and decoder from ETF (Erlang External Term Format)
used by the network distribution layer.
Decoding terms takes an optional 'options' argument. It defaults to an empty
dictionary, and it can contain the following optional keys:
* "binaries_as_bytes", default False. Ignores bit tail of bit strings and
returns all Erlang binaries as Python bytes.
* "atoms_as_strings", default False. Always converts atoms to Python
strings. This is potentially faster than using the Atom wrapper class.
"""
from __future__ import print_function

Expand Down Expand Up @@ -66,9 +74,11 @@ def incomplete_data(where=""):
raise ETFDecodeException("Incomplete data at " + where)


def binary_to_term(data: bytes):
def binary_to_term(data: bytes, options: dict = {}):
""" Strip 131 header and unpack if the data was compressed.
:param data: The incoming encoded data with the 131 byte
:param options: See description on top of the module
:raises ETFDecodeException: when the tag is not 131, when compressed
data is incomplete or corrupted
"""
Expand All @@ -83,12 +93,12 @@ def binary_to_term(data: bytes):
# Data corruption?
raise ETFDecodeException("Compressed size mismatch with actual")

return binary_to_term_2(decomp)
return binary_to_term_2(decomp, options)

return binary_to_term_2(data[1:])
return binary_to_term_2(data[1:], options)


def _bytes_to_atom(name: bytes, encoding: str = 'utf8'):
def _bytes_to_atom(name: bytes, encoding: str, options: dict):
""" Recognize familiar atom values. """
if name == b'true':
return True
Expand All @@ -97,10 +107,14 @@ def _bytes_to_atom(name: bytes, encoding: str = 'utf8'):
if name == b'undefined':
return None

return term.Atom(name.decode(encoding))
if options.get("atoms_as_strings", False):
return name.decode(encoding)

return term.Atom(text=name.decode(encoding),
encoding=encoding)


def binary_to_term_2(data: bytes):
def binary_to_term_2(data: bytes, options: dict = {}):
""" Proceed decoding after leading tag has been checked and removed.
Erlang lists are decoded into ``term.List`` object, whose ``elements_``
Expand All @@ -112,6 +126,7 @@ def binary_to_term_2(data: bytes):
``dict``. Binaries and bit strings are decoded into ``term.Binary``
object, with optional last bits omitted.
:param options: See description on top of the module
:param data: Bytes containing encoded term without 131 header
:return: Tuple (Value, TailBytes) The function consumes as much data as
possible and returns the tail. Tail can be used again to parse
Expand All @@ -131,7 +146,7 @@ def binary_to_term_2(data: bytes):

name = data[3:len_expected]
enc = 'latin-1' if tag == TAG_ATOM_EXT else 'utf8'
return _bytes_to_atom(name, enc), data[len_expected:]
return _bytes_to_atom(name, enc, options), data[len_expected:]

if tag == [TAG_SMALL_ATOM_EXT, TAG_SMALL_ATOM_UTF8_EXT]:
len_data = len(data)
Expand All @@ -142,7 +157,7 @@ def binary_to_term_2(data: bytes):
name = data[2:len_expected]

enc = 'latin-1' if tag == TAG_SMALL_ATOM_EXT else 'utf8'
return term.Atom(name.decode(enc)), data[len_expected:]
return _bytes_to_atom(name, enc, options), data[len_expected:]

if tag == TAG_NIL_EXT:
return [], data[1:]
Expand Down Expand Up @@ -260,6 +275,9 @@ def binary_to_term_2(data: bytes):
if len_expected > len_data:
return incomplete_data("decoding data for a binary")

if options.get("binaries_as_bytes", False):
return data[5:len_expected], data[len_expected:]

bin1 = term.Binary(data=data[5:len_expected])
return bin1, data[len_expected:]

Expand All @@ -272,6 +290,9 @@ def binary_to_term_2(data: bytes):
if len_expected > len_data:
return incomplete_data("decoding data for a bit-binary")

if options.get("binaries_as_bytes", False):
return data[6:len_expected], data[len_expected:]

bin1 = term.Binary(data=data[6:len_expected],
last_byte_bits=lbb)
return bin1, data[len_expected:]
Expand Down
3 changes: 2 additions & 1 deletion Pyrlang/term.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,9 @@ def __repr__(self) -> str:
def __str__(self):
return self.text_

def __init__(self, text: str) -> None:
def __init__(self, text: str, encoding: str = 'utf8') -> None:
self.text_ = text
self.enc_ = encoding

def equals(self, other) -> bool:
return isinstance(other, Atom) and self.text_ == other.text_
Expand Down
20 changes: 16 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,7 @@ Features
--------

* Based on gevent which supports Python 2 and 3;
* Erlang distribution protocol:
* Can encode lists, tuples, atoms, integers, floats, maps,
strings, binaries, pids, referenes, Python objects etc.;
* Can not encode functions, ports
* Erlang distribution protocol
* Registry of Python 'processes', which are gevent Greenlets and have a pid
and an optional registered name;
* Send and receive messages locally and remotely by pid or name;
Expand All @@ -31,6 +28,21 @@ Features
gen_server-style calls


| Erlang | Python | Notes |
|----------------------|-----------------------------|---------------------------------------------------------------------------------------|
| Integer, big integer | Python integer              | Python is capable of big integers too                                                 |
| Float | Python float | |
| String | term.List | Has a method `as_unicode()` to get the string |
| Atom | term.Atom or string | Can use str() or access text_ field directly. Can decode both UTF8 and Latin-1 atoms. |
| List | term.List or Python list | |
| Tuple | Python tuple | |
| Map | Python dict | |
| Binary, bit string | term.Binary or Python bytes | A class which holds bytes and optional count for last byte bits |
| Pid, reference | term.Pid and term.Reference | Always long external Pids and Refs with a node name in them |
| Lambda (fun) | term.Fun | A class which holds parsed fun fields, not usable or useful in Python |
| | Any other object | Any unknown Python object will be encoded as {'Classname', #{field1 => value1...}} |


Building
--------

Expand Down
1 change: 1 addition & 0 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ changes required.
:caption: Contents:

getting_started

node
process
term
Expand Down
20 changes: 18 additions & 2 deletions test/dist_etf_decode_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,18 @@ class TestETFDecode(unittest.TestCase):
def test_decode_atom(self):
""" Try an atom 'hello' """
b1 = bytes([131, 100, 0, 5, 104, 101, 108, 108, 111])
(t1, tail) = etf.binary_to_term(b1)
(t1, tail1) = etf.binary_to_term(b1)
self.assertTrue(isinstance(t1, term.Atom))
self.assertEqual(t1.text_, "hello")
self.assertEqual(tail, b'')
self.assertEqual(tail1, b'')

def test_decode_atom_as_string(self):
    """ Decode the atom 'hello' with "atoms_as_strings" enabled and expect
    a plain Python ``str`` instead of a ``term.Atom``.
    """
    # 131 = ETF version tag, 100 = ATOM_EXT, 0 5 = 16-bit length, then the
    # five name bytes spelling "hello".
    encoded = bytes([131, 100, 0, 5]) + b"hello"
    decode_options = {"atoms_as_strings": True}

    decoded, remainder = etf.binary_to_term(encoded, decode_options)

    self.assertTrue(isinstance(decoded, str))
    self.assertEqual(decoded, "hello")
    self.assertEqual(remainder, b'')

def test_decode_str(self):
""" Try a simple ASCII string """
Expand Down Expand Up @@ -77,6 +85,14 @@ def test_decode_fun(self):
self.assertTrue(isinstance(val, term.Fun))
self.assertEqual(tail, b'')

def test_decode_binary(self):
    """ The same one-byte binary must yield an identical payload whether it
    is decoded into a ``term.Binary`` (default) or, with the
    "binaries_as_bytes" option, into raw Python ``bytes``.
    """
    # 131 = ETF version tag, 109 = BINARY_EXT, 32-bit length 1, payload 34.
    encoded = bytes([131, 109, 0, 0, 0, 1, 34])

    as_binary, rest_default = etf.binary_to_term(encoded)
    as_bytes, rest_raw = etf.binary_to_term(encoded,
                                            {"binaries_as_bytes": True})

    self.assertEqual(as_binary.bytes_, as_bytes)
    self.assertEqual(rest_default, b'')
    self.assertEqual(rest_raw, b'')

if __name__ == '__main__':
unittest.main()

0 comments on commit 9ecbccc

Please sign in to comment.