Skip to content

Commit

Permalink
Add UUID conversion to and from 16 byte fixed sequences
Browse files Browse the repository at this point in the history
UUIDs are often passed around in application code in their canonical,
hex as string representation e.g. "550e8400-e29b-41d4-a716-446655440000".
Encoding UUIDs as Avro "string"s takes 37 bytes, while encoding UUIDs in
their binary form fits into a 16 byte sized "fixed", saving 21 bytes per
encoding.

This change allows application code to keep passing around canonical hex
UUIDs while converting to the compact encoding, requiring only
`uuid_format: :canonical_string` to be given in decode options.

The [Java reference implementation][java-implementation] also supports
encoding UUIDs as both strings and 16 byte fixed sequences.

* Encoding is augmented such that a 16 byte fixed schema with
  `%{"logicalType" => "uuid"}`, converts a hex-string UUID to the 16
  byte binary representation.

* Decoding is augmented such that given `uuid_format: :canonical_string`
  in decode options, the binary representation is converted to the
  canonical hex-string representation.

The encoding change is nearly backwards-compatible: previously, when
given an incorrectly sized "fixed" with `{"logicalType": "uuid"}`, an
error was raised, while now conversion is attempted.

The decoding change is fully backwards-compatible, as `uuid_format`
defaults to `:binary`.

For UUID codec, the `uniq` library was added (no transitive
dependencies).

[java-implementation]: https://github.com/apache/avro/blob/230414abbb68e63e68f3b55bfc0cbca94f2737f6/lang/java/avro/src/main/java/org/apache/avro/LogicalTypes.java#L291-L309
  • Loading branch information
urmastalimaa committed Feb 12, 2025
1 parent f4091e2 commit e2bfb37
Show file tree
Hide file tree
Showing 9 changed files with 124 additions and 20 deletions.
21 changes: 21 additions & 0 deletions lib/avro_ex.ex
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,14 @@ defmodule AvroEx do
of blocks with their counts. This allows consumers of the encoded data to skip
over those blocks in an efficient manner. Using the option `include_block_byte_size: true`
enables adding those additional values.
## UUID encoding
UUIDs can be encoded as strings using the canonical hex representation, which takes 37 bytes.
Alternatively, encoding UUIDs in their 16 byte binary representation is much
more compact, saving 21 bytes per encoding.
See "UUIDs" on `decode/3` for how to convert binary representations back to
canonical strings during decoding.
"""
@spec encode(Schema.t(), term, keyword()) ::
{:ok, encoded_avro} | {:error, AvroEx.EncodeError.t() | Exception.t()}
Expand Down Expand Up @@ -185,6 +193,19 @@ defmodule AvroEx do
Otherwise, an approximate number is calculated.
## UUIDs
When decoding a 16 byte fixed quantity with logical type "uuid", specify
`uuid_format: :binary` to retain the binary representation or
`uuid_format: :canonical_string` to convert to the canonical, hex as string representation.
iex> schema = AvroEx.decode_schema!(~S({"type": "fixed", "size": 16, "name": "fixed_uuid", "logicalType":"uuid"}))
iex> binary_uuid = <<85, 14, 132, 0, 226, 155, 65, 212, 167, 22, 68, 102, 85, 68, 0, 0>>
iex> AvroEx.decode(schema, binary_uuid, uuid_format: :binary)
{:ok, binary_uuid}
iex> AvroEx.decode(schema, binary_uuid, uuid_format: :canonical_string)
{:ok, "550e8400-e29b-41d4-a716-446655440000"}
"""
@spec decode(Schema.t(), encoded_avro, keyword()) ::
{:ok, term}
Expand Down
19 changes: 19 additions & 0 deletions lib/avro_ex/decode.ex
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,25 @@ defmodule AvroEx.Decode do
{:lists.nth(index + 1, symbols), rest}
end

# Decodes a 16 byte "fixed" carrying the "uuid" logical type.
#
# The first 16 bytes of `data` are consumed and the remainder is returned
# alongside the decoded value. The `:uuid_format` option (default `:binary`)
# selects the shape of the decoded value:
#
#   * `:binary` - the raw 16 bytes, untouched
#   * `:canonical_string` - the value rendered by `Uniq.UUID.to_string/2`
#     with the `:default` format; bytes that `Uniq.UUID.parse/1` rejects are
#     reported through `error({:invalid_binary_uuid, bytes})`
defp do_decode(%Fixed{size: 16, metadata: %{"logicalType" => "uuid"}}, %Context{}, data, opts)
     when is_binary(data) do
  <<raw_uuid::binary-size(16), remainder::binary>> = data
  uuid_format = Keyword.get(opts, :uuid_format, :binary)

  case uuid_format do
    :binary ->
      {raw_uuid, remainder}

    :canonical_string ->
      with {:ok, parsed} <- Uniq.UUID.parse(raw_uuid) do
        {Uniq.UUID.to_string(parsed, :default), remainder}
      else
        _ -> error({:invalid_binary_uuid, raw_uuid})
      end
  end
end

defp do_decode(%Fixed{size: size}, %Context{}, data, _) when is_binary(data) do
<<fixed::binary-size(size), rest::binary>> = data
{fixed, rest}
Expand Down
5 changes: 5 additions & 0 deletions lib/avro_ex/decode_error.ex
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,9 @@ defmodule AvroEx.DecodeError do
message = "Invalid UTF-8 string found #{inspect(str)}."
%__MODULE__{message: message}
end

# Builds the error for a 16 byte fixed value that could not be
# interpreted as a UUID; the offending bytes are shown via `inspect/1`.
def new({:invalid_binary_uuid, binary_uuid}) do
  %__MODULE__{message: "Invalid binary UUID found #{inspect(binary_uuid)}."}
end
end
5 changes: 5 additions & 0 deletions lib/avro_ex/encode.ex
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,11 @@ defmodule AvroEx.Encode do
bin
end

# Encodes a textual UUID for a 16 byte "fixed" with the "uuid" logical type.
#
# The given binary is converted to its raw form with
# `Uniq.UUID.string_to_binary!/1` and then encoded again as a plain fixed
# value (presumably by the size-matching `Fixed` clause defined earlier,
# which is not fully visible here).
#
# Input that cannot be converted is reported through the same
# `:incorrect_fixed_size` error used for any other wrongly sized fixed value,
# so that callers receive an encode error instead of a foreign exception
# escaping from the UUID library.
defp do_encode(%Fixed{size: 16, metadata: %{"logicalType" => "uuid"}} = fixed, %Context{} = context, bin, opts)
     when is_binary(bin) do
  raw_uuid =
    try do
      Uniq.UUID.string_to_binary!(bin)
    rescue
      # NOTE(review): assumes Uniq signals unparseable input with
      # ArgumentError - confirm against the Uniq version in mix.lock.
      ArgumentError -> error({:incorrect_fixed_size, fixed, bin, context})
    end

  do_encode(fixed, context, raw_uuid, opts)
end

# Fallback for binaries that no earlier `Fixed` clause accepted: report the
# value as having the wrong size for the schema.
defp do_encode(%Fixed{} = schema, %Context{} = ctx, value, _opts) when is_binary(value) do
  error({:incorrect_fixed_size, schema, value, ctx})
end
Expand Down
5 changes: 5 additions & 0 deletions lib/avro_ex/schema/fixed.ex
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,10 @@ defmodule AvroEx.Schema.Fixed do
true
end

# A binary offered to a 16 byte fixed "uuid" schema matches when
# `Uniq.UUID.valid?/1` accepts it.
def match?(%__MODULE__{size: 16, metadata: %{"logicalType" => "uuid"}}, %Context{}, candidate)
    when is_binary(candidate),
    do: Uniq.UUID.valid?(candidate)

# Anything not accepted by an earlier clause does not match.
def match?(_fixed, _context, _data) do
  false
end
end
3 changes: 2 additions & 1 deletion mix.exs
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ defmodule AvroEx.Mixfile do
{:dialyxir, "~> 1.1", only: :dev, runtime: false},
{:ex_doc, "~> 0.20", only: :dev, runtime: false},
{:stream_data, "~> 0.5", only: [:dev, :test]},
{:decimal, "~> 2.0", optional: true}
{:decimal, "~> 2.0", optional: true},
{:uniq, "~> 0.6"}
]
end

Expand Down
1 change: 1 addition & 0 deletions mix.lock
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,5 @@
"nimble_parsec": {:hex, :nimble_parsec, "1.2.3", "244836e6e3f1200c7f30cb56733fd808744eca61fd182f731eac4af635cc6d0b", [:mix], [], "hexpm", "c8d789e39b9131acf7b99291e93dae60ab48ef14a7ee9d58c6964f59efb570b0"},
"stream_data": {:hex, :stream_data, "0.5.0", "b27641e58941685c75b353577dc602c9d2c12292dd84babf506c2033cd97893e", [:mix], [], "hexpm", "012bd2eec069ada4db3411f9115ccafa38540a3c78c4c0349f151fc761b9e271"},
"typed_struct": {:hex, :typed_struct, "0.3.0", "939789e3c1dca39d7170c87f729127469d1315dcf99fee8e152bb774b17e7ff7", [:mix], [], "hexpm", "c50bd5c3a61fe4e198a8504f939be3d3c85903b382bde4865579bc23111d1b6d"},
"uniq": {:hex, :uniq, "0.6.1", "369660ecbc19051be526df3aa85dc393af5f61f45209bce2fa6d7adb051ae03c", [:mix], [{:ecto, "~> 3.0", [hex: :ecto, repo: "hexpm", optional: true]}], "hexpm", "6426c34d677054b3056947125b22e0daafd10367b85f349e24ac60f44effb916"},
}
27 changes: 27 additions & 0 deletions test/decode_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,20 @@ defmodule AvroEx.Decode.Test do
"decimalField4" => 5.3e-11
}
end

test "16 byte fixed uuid" do
  schema_json = ~S({"type": "fixed", "size": 16, "name": "fixed_uuid", "logicalType":"uuid"})
  {:ok, fixed_uuid_schema} = AvroEx.decode_schema(schema_json)

  # Example from https://en.wikipedia.org/wiki/Universally_unique_identifier#Textual_representation
  expected_string = "550e8400-e29b-41d4-a716-446655440000"
  raw_uuid = :binary.encode_unsigned(113_059_749_145_936_325_402_354_257_176_981_405_696)

  # The raw bytes come back untouched, both explicitly and by default.
  assert AvroEx.decode(fixed_uuid_schema, raw_uuid, uuid_format: :binary) == {:ok, raw_uuid}
  assert AvroEx.decode(fixed_uuid_schema, raw_uuid) == {:ok, raw_uuid}

  # Opting in converts the bytes to the canonical textual form.
  assert AvroEx.decode(fixed_uuid_schema, raw_uuid, uuid_format: :canonical_string) ==
           {:ok, expected_string}
end
end

describe "DecodingError" do
Expand All @@ -354,5 +368,18 @@ defmodule AvroEx.Decode.Test do
AvroEx.decode!(schema, <<"\nhell", 0xFFFF::16>>)
end
end

test "invalid fixed uuid" do
  schema_json = ~S({"type": "fixed", "size": 16, "name": "fixed_uuid", "logicalType":"uuid"})
  {:ok, fixed_uuid_schema} = AvroEx.decode_schema(schema_json)

  # Sixteen bytes of 0x01 are the right size but are not accepted as a UUID.
  non_uuid_binary = :binary.copy(<<1>>, 16)
  expected_message = "Invalid binary UUID found <<1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>>."

  assert_raise DecodeError, expected_message, fn ->
    AvroEx.decode!(fixed_uuid_schema, non_uuid_binary, uuid_format: :canonical_string)
  end
end
end
end
58 changes: 39 additions & 19 deletions test/encode_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,19 @@ defmodule AvroEx.Encode.Test do
"decimalField4" => 5.3e-11
}
end

test "16 byte fixed uuid" do
  schema =
    AvroEx.decode_schema!(%{
      "type" => "fixed",
      "size" => 16,
      "name" => "fixed_uuid",
      "logicalType" => "uuid"
    })

  assert %AvroEx.Schema{} = schema

  # Example from https://en.wikipedia.org/wiki/Universally_unique_identifier#Textual_representation
  textual_uuid = "550e8400-e29b-41d4-a716-446655440000"
  raw_uuid = :binary.encode_unsigned(113_059_749_145_936_325_402_354_257_176_981_405_696)

  # Both the textual and the raw form encode to the same 16 bytes.
  assert AvroEx.encode(schema, textual_uuid) == {:ok, raw_uuid}
  assert AvroEx.encode(schema, raw_uuid) == {:ok, raw_uuid}
end
end

describe "variable_integer_encode" do
Expand Down Expand Up @@ -282,51 +295,50 @@ defmodule AvroEx.Encode.Test do
end

describe "encode (union)" do
# Returns `index` encoded with the Avro "int" schema, i.e. the prefix the
# union tests expect in front of the encoded branch value.
defp union_index(index) do
  {:ok, int_schema} = AvroEx.decode_schema(~S("int"))
  {:ok, encoded_index} = @test_module.encode(int_schema, index)

  encoded_index
end

test "works as expected with nulls" do
{:ok, schema} = AvroEx.decode_schema(~S(["null", "int"]))
{:ok, null_schema} = AvroEx.decode_schema(~S("null"))
{:ok, int_schema} = AvroEx.decode_schema(~S("int"))

{:ok, index} = @test_module.encode(int_schema, 0)
{:ok, encoded_null} = @test_module.encode(null_schema, nil)
{:ok, encoded_union} = @test_module.encode(schema, nil)

assert encoded_union == index <> encoded_null
assert encoded_union == union_index(0) <> encoded_null
end

test "works as expected with ints" do
{:ok, schema} = AvroEx.decode_schema(~S(["null", "int"]))
{:ok, int_schema} = AvroEx.decode_schema(~S("int"))

{:ok, index} = @test_module.encode(int_schema, 1)
{:ok, encoded_int} = @test_module.encode(int_schema, 2086)
{:ok, encoded_union} = @test_module.encode(schema, 2086)

assert encoded_union == index <> encoded_int
assert encoded_union == union_index(1) <> encoded_int
end

test "works as expected with int and long" do
{:ok, schema} = AvroEx.decode_schema(~S(["int", "long"]))
{:ok, int_schema} = AvroEx.decode_schema(~S("int"))
{:ok, long_schema} = AvroEx.decode_schema(~S("long"))

{:ok, index} = @test_module.encode(int_schema, 1)
{:ok, encoded_long} = @test_module.encode(long_schema, -3_376_656_585_598_455_353)
{:ok, encoded_union} = @test_module.encode(schema, -3_376_656_585_598_455_353)

assert encoded_union == index <> encoded_long
assert encoded_union == union_index(1) <> encoded_long
end

test "works as expected with float and double" do
{:ok, schema} = AvroEx.decode_schema(~S(["float", "double"]))
{:ok, int_schema} = AvroEx.decode_schema(~S("int"))
{:ok, double_schema} = AvroEx.decode_schema(~S("double"))

{:ok, index} = @test_module.encode(int_schema, 1)
{:ok, encoded_long} = @test_module.encode(double_schema, 0.0000000001)
{:ok, encoded_union} = @test_module.encode(schema, 0.0000000001)

assert encoded_union == index <> encoded_long
assert encoded_union == union_index(1) <> encoded_long
end

test "works as expected with logical types" do
Expand All @@ -336,11 +348,23 @@ defmodule AvroEx.Encode.Test do
{:ok, schema} = AvroEx.decode_schema(~s(["null", #{datetime_json}]))
{:ok, datetime_schema} = AvroEx.decode_schema(datetime_json)

{:ok, index} = @test_module.encode(datetime_schema, 1)
{:ok, encoded_datetime} = @test_module.encode(datetime_schema, datetime_value)
{:ok, encoded_union} = @test_module.encode(schema, datetime_value)

assert encoded_union == index <> encoded_datetime
assert encoded_union == union_index(1) <> encoded_datetime
end

test "works as expected with 16 byte fixed UUID logical types" do
  uuid_value = "550e8400-e29b-41d4-a716-446655440000"
  fixed_uuid_json = ~S({"type": "fixed", "size": 16, "name": "fixed_uuid", "logicalType": "uuid"})

  {:ok, fixed_uuid_schema} = AvroEx.decode_schema(fixed_uuid_json)
  {:ok, union_schema} = AvroEx.decode_schema(~s(["null", #{fixed_uuid_json}]))

  {:ok, encoded_union} = @test_module.encode(union_schema, uuid_value)
  {:ok, encoded_uuid} = @test_module.encode(fixed_uuid_schema, uuid_value)

  # The fixed UUID is the second union branch, so index 1 prefixes the value.
  assert encoded_union == union_index(1) <> encoded_uuid
end

test "works as expected with records" do
Expand All @@ -358,14 +382,12 @@ defmodule AvroEx.Encode.Test do
json_schema = ~s(["null", #{record_json}])

{:ok, schema} = AvroEx.decode_schema(json_schema)
{:ok, int_schema} = AvroEx.decode_schema(~S("int"))
{:ok, record_schema} = AvroEx.decode_schema(record_json)

{:ok, index} = @test_module.encode(int_schema, 1)
{:ok, encoded_record} = @test_module.encode(record_schema, %{"a" => 25, "b" => "hello"})
{:ok, encoded_union} = @test_module.encode(schema, %{"a" => 25, "b" => "hello"})

assert encoded_union == index <> encoded_record
assert encoded_union == union_index(1) <> encoded_record
end

test "works as expected with union values tagged for a named possibility" do
Expand All @@ -384,14 +406,12 @@ defmodule AvroEx.Encode.Test do
json_schema = ~s([#{record_json_factory.("a")}, #{record_json_factory.("b")}])

{:ok, schema} = AvroEx.decode_schema(json_schema)
{:ok, int_schema} = AvroEx.decode_schema(~S("int"))
{:ok, record_schema} = AvroEx.decode_schema(record_json_factory.("b"))

{:ok, index} = @test_module.encode(int_schema, 1)
{:ok, encoded_record} = @test_module.encode(record_schema, %{"value" => "hello"})
{:ok, encoded_union} = @test_module.encode(schema, {"b", %{"value" => "hello"}})

assert encoded_union == index <> encoded_record
assert encoded_union == union_index(1) <> encoded_record
end

test "errors with a clear error for tagged unions" do
Expand Down

0 comments on commit e2bfb37

Please sign in to comment.