Skip to content

encoder

BinaryEncoder

Encodes Python physical types into bytes.

Source code in pyiceberg/avro/encoder.py
class BinaryEncoder:
    """Encodes Python physical types into bytes.

    Each ``write_*`` method serializes a single value and forwards the
    resulting bytes to the output stream supplied at construction time.
    """

    # Destination that receives every encoded byte sequence.
    _output_stream: OutputStream

    def __init__(self, output_stream: OutputStream) -> None:
        """Create an encoder that writes to ``output_stream``.

        Args:
            output_stream: Object whose ``write`` method receives the encoded bytes.
        """
        self._output_stream = output_stream

    def write(self, b: bytes) -> None:
        """Write raw bytes to the underlying output stream without any framing.

        Args:
            b: The bytes to write.
        """
        self._output_stream.write(b)

    def write_boolean(self, boolean: bool) -> None:
        """Write a boolean as a single byte whose value is either 0 (false) or 1 (true).

        Args:
            boolean: The boolean to write.
        """
        # bool() normalises truthy/falsy inputs so only 0 or 1 is ever emitted.
        self.write(bytearray([bool(boolean)]))

    def write_int(self, integer: int) -> None:
        """Integer and long values are written using variable-length zig-zag coding.

        Args:
            integer: The value to write.
        """
        # Zig-zag: fold the sign into the lowest bit so small magnitudes,
        # positive or negative, need few bytes.
        datum = (integer << 1) ^ (integer >> 63)
        # Emit 7 bits per byte, least-significant group first; the high bit
        # of each byte signals that more bytes follow.
        while (datum & ~0x7F) != 0:
            self.write(bytearray([(datum & 0x7F) | 0x80]))
            datum >>= 7
        self.write(bytearray([datum]))

    def write_float(self, f: float) -> None:
        """Write a float as 4 bytes.

        Args:
            f: The value to pack with ``STRUCT_FLOAT``.
        """
        self.write(STRUCT_FLOAT.pack(f))

    def write_double(self, f: float) -> None:
        """Write a double as 8 bytes.

        Args:
            f: The value to pack with ``STRUCT_DOUBLE``.
        """
        self.write(STRUCT_DOUBLE.pack(f))

    def write_bytes(self, b: bytes) -> None:
        """Bytes are encoded as a long followed by that many bytes of data.

        Args:
            b: The payload; its length is written first via ``write_int``.
        """
        self.write_int(len(b))
        self.write(b)

    def write_utf8(self, s: str) -> None:
        """Encode a string as a long followed by that many bytes of UTF-8 encoded character data.

        Args:
            s: The string to encode and write.
        """
        self.write_bytes(s.encode(UTF8))

    def write_uuid(self, uuid: UUID) -> None:
        """Write UUID as a fixed[16].

        The 16 raw bytes of the UUID are written as-is, with no length prefix.

        Args:
            uuid: The UUID to write.

        Raises:
            ValueError: If ``uuid.bytes`` is not exactly 16 bytes long.
        """
        if len(uuid.bytes) != 16:
            # Report the actual length that was found.
            raise ValueError(f"Expected UUID to have 16 bytes, got: {len(uuid.bytes)}")
        self.write(uuid.bytes)

write_boolean(boolean)

Write a boolean as a single byte whose value is either 0 (false) or 1 (true).

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| boolean | bool | The boolean to write. | required |
Source code in pyiceberg/avro/encoder.py
def write_boolean(self, boolean: bool) -> None:
    """Emit a boolean as one byte: 0 for false, 1 for true.

    Args:
        boolean: The value to write; it is coerced with ``bool()`` first.
    """
    flag = bool(boolean)
    payload = bytearray([flag])
    self.write(payload)

write_bytes(b)

Bytes are encoded as a long followed by that many bytes of data.

Source code in pyiceberg/avro/encoder.py
def write_bytes(self, b: bytes) -> None:
    """Write a length prefix (via ``write_int``) followed by the payload itself.

    Args:
        b: The bytes to write after their length.
    """
    length = len(b)
    self.write_int(length)
    self.write(b)

write_double(f)

Write a double as 8 bytes.

Source code in pyiceberg/avro/encoder.py
def write_double(self, f: float) -> None:
    """Pack ``f`` with ``STRUCT_DOUBLE`` (a double, 8 bytes) and write the result.

    Args:
        f: The value to write.
    """
    packed = STRUCT_DOUBLE.pack(f)
    self.write(packed)

write_float(f)

Write a float as 4 bytes.

Source code in pyiceberg/avro/encoder.py
def write_float(self, f: float) -> None:
    """Pack ``f`` with ``STRUCT_FLOAT`` (a float, 4 bytes) and write the result.

    Args:
        f: The value to write.
    """
    packed = STRUCT_FLOAT.pack(f)
    self.write(packed)

write_int(integer)

Integer and long values are written using variable-length zig-zag coding.

Source code in pyiceberg/avro/encoder.py
def write_int(self, integer: int) -> None:
    """Write an integer or long using variable-length zig-zag coding.

    Args:
        integer: The value to write.
    """
    # Zig-zag step: move the sign into the lowest bit.
    zigzag = (integer << 1) ^ (integer >> 63)
    # Emit 7-bit groups, lowest first, flagging each non-final byte with 0x80.
    while zigzag & ~0x7F:
        low_bits = zigzag & 0x7F
        self.write(bytearray([low_bits | 0x80]))
        zigzag >>= 7
    # The final group fits in 7 bits, so its continuation bit stays clear.
    self.write(bytearray([zigzag]))

write_utf8(s)

Encode a string as a long followed by that many bytes of UTF-8 encoded character data.

Source code in pyiceberg/avro/encoder.py
def write_utf8(self, s: str) -> None:
    """Encode ``s`` with the ``UTF8`` codec and write it through ``write_bytes``.

    Args:
        s: The string to write.
    """
    encoded = s.encode(UTF8)
    self.write_bytes(encoded)

write_uuid(uuid)

Write UUID as a fixed[16].

The uuid logical type represents a randomly generated universally unique identifier (UUID). A uuid logical type annotates an Avro string. The string has to conform with RFC-4122.

Source code in pyiceberg/avro/encoder.py
def write_uuid(self, uuid: UUID) -> None:
    """Write UUID as a fixed[16].

    The 16 raw bytes of the UUID are written as-is, with no length prefix.

    Args:
        uuid: The UUID to write.

    Raises:
        ValueError: If ``uuid.bytes`` is not exactly 16 bytes long.
    """
    if len(uuid.bytes) != 16:
        # Report the actual length that was found.
        raise ValueError(f"Expected UUID to have 16 bytes, got: {len(uuid.bytes)}")
    self.write(uuid.bytes)