From 472d0255712996ed8d107dd54b53dbcf06549e53 Mon Sep 17 00:00:00 2001 From: urmastalimaa Date: Wed, 12 Feb 2025 12:10:10 +0200 Subject: [PATCH 1/2] Add UUID conversion to and from 16 byte fixed sequences UUIDs are often passed around in application code in their canonical hex-string representation, e.g. "550e8400-e29b-41d4-a716-446655440000". Encoding a UUID as an Avro "string" takes 37 bytes, while its binary form fits into a 16 byte "fixed", saving 21 bytes per encoding. This change allows application code to keep passing around canonical hex UUIDs while converting to the compact encoding, requiring only `uuid_format: :canonical_string` to be given in decode options. The [Java reference implementation][java-implementation] also supports encoding UUIDs as both strings and 16 byte fixed sequences. * Encoding is augmented such that a 16 byte fixed schema with `%{"logicalType" => "uuid"}` converts a hex-string UUID to the 16 byte binary representation. * Decoding is augmented such that, given `uuid_format: :canonical_string` in decode options, the binary representation is converted to the canonical hex-string representation. The encoding change is nearly backwards-compatible: previously, when given an incorrectly sized "fixed" with `{"logicalType": "uuid"}`, an error was raised, while now conversion is attempted. The decoding change is fully backwards-compatible, as `uuid_format` defaults to `:binary`. For the UUID codec, the `uniq` library was added (no required transitive dependencies). 
[java-implementation]: https://github.com/apache/avro/blob/230414abbb68e63e68f3b55bfc0cbca94f2737f6/lang/java/avro/src/main/java/org/apache/avro/LogicalTypes.java#L291-L309 --- lib/avro_ex.ex | 21 ++++++++++++++ lib/avro_ex/decode.ex | 19 ++++++++++++ lib/avro_ex/decode_error.ex | 5 ++++ lib/avro_ex/encode.ex | 5 ++++ lib/avro_ex/schema/fixed.ex | 5 ++++ mix.exs | 3 +- mix.lock | 1 + test/decode_test.exs | 27 +++++++++++++++++ test/encode_test.exs | 58 +++++++++++++++++++++++++------------ 9 files changed, 124 insertions(+), 20 deletions(-) diff --git a/lib/avro_ex.ex b/lib/avro_ex.ex index 89cc30d..b9f3548 100644 --- a/lib/avro_ex.ex +++ b/lib/avro_ex.ex @@ -136,6 +136,14 @@ defmodule AvroEx do of blocks with their counts. This allows consumers of the encoded data to skip over those blocks in an efficient manner. Using the option `include_block_byte_size: true` enables adding those additional values. + + ## UUID encoding + + UUIDs can be encoded as strings using the canonical hex representation, which takes 37 bytes. + Alternatively, encoding UUIDs in their 16 byte binary representation is much + more compact, saving 21 bytes per encoding. + See "UUIDs" on `decode/3` for how to convert binary representations back to + canonical strings during decoding. """ @spec encode(Schema.t(), term, keyword()) :: {:ok, encoded_avro} | {:error, AvroEx.EncodeError.t() | Exception.t()} @@ -185,6 +193,19 @@ defmodule AvroEx do Otherwise, an approximate number is calculated. + ## UUIDs + + When decoding a 16 byte fixed quantity with logical type "uuid", specify + `uuid_format: :binary` to retain the binary representation or + `uuid_format: :canonical_string` to convert to the canonical hex-string representation. 
+ + iex> schema = AvroEx.decode_schema!(~S({"type": "fixed", "size": 16, "name": "fixed_uuid", "logicalType":"uuid"})) + iex> binary_uuid = <<85, 14, 132, 0, 226, 155, 65, 212, 167, 22, 68, 102, 85, 68, 0, 0>> + iex> AvroEx.decode(schema, binary_uuid, uuid_format: :binary) + {:ok, binary_uuid} + iex> AvroEx.decode(schema, binary_uuid, uuid_format: :canonical_string) + {:ok, "550e8400-e29b-41d4-a716-446655440000"} + """ @spec decode(Schema.t(), encoded_avro, keyword()) :: {:ok, term} diff --git a/lib/avro_ex/decode.ex b/lib/avro_ex/decode.ex index 8a8d37f..f84b2b5 100644 --- a/lib/avro_ex/decode.ex +++ b/lib/avro_ex/decode.ex @@ -270,6 +270,25 @@ defmodule AvroEx.Decode do {:lists.nth(index + 1, symbols), rest} end + defp do_decode(%Fixed{size: size = 16, metadata: %{"logicalType" => "uuid"}}, %Context{}, data, opts) + when is_binary(data) do + <<fixed::binary-size(size), rest::binary>> = data + + case Keyword.get(opts, :uuid_format, :binary) do + :binary -> + {fixed, rest} + + :canonical_string -> + case Uniq.UUID.parse(fixed) do + {:ok, uuid} -> + {Uniq.UUID.to_string(uuid, :default), rest} + + _ -> + error({:invalid_binary_uuid, fixed}) + end + end + end + defp do_decode(%Fixed{size: size}, %Context{}, data, _) when is_binary(data) do <<fixed::binary-size(size), rest::binary>> = data {fixed, rest} diff --git a/lib/avro_ex/decode_error.ex b/lib/avro_ex/decode_error.ex index 5d5ac62..f170cab 100644 --- a/lib/avro_ex/decode_error.ex +++ b/lib/avro_ex/decode_error.ex @@ -12,4 +12,9 @@ defmodule AvroEx.DecodeError do message = "Invalid UTF-8 string found #{inspect(str)}." %__MODULE__{message: message} end + + def new({:invalid_binary_uuid, binary_uuid}) do + message = "Invalid binary UUID found #{inspect(binary_uuid)}." 
+ %__MODULE__{message: message} + end end diff --git a/lib/avro_ex/encode.ex b/lib/avro_ex/encode.ex index 6e3a671..75c7d36 100644 --- a/lib/avro_ex/encode.ex +++ b/lib/avro_ex/encode.ex @@ -169,6 +169,11 @@ defmodule AvroEx.Encode do bin end + defp do_encode(%Fixed{size: 16, metadata: %{"logicalType" => "uuid"}} = f, %Context{} = context, bin, opts) + when is_binary(bin) do + do_encode(f, context, Uniq.UUID.string_to_binary!(bin), opts) + end + defp do_encode(%Fixed{} = fixed, %Context{} = context, bin, _) when is_binary(bin) do error({:incorrect_fixed_size, fixed, bin, context}) end diff --git a/lib/avro_ex/schema/fixed.ex b/lib/avro_ex/schema/fixed.ex index 18e2e30..c363864 100644 --- a/lib/avro_ex/schema/fixed.ex +++ b/lib/avro_ex/schema/fixed.ex @@ -19,5 +19,10 @@ defmodule AvroEx.Schema.Fixed do true end + def match?(%__MODULE__{size: 16, metadata: %{"logicalType" => "uuid"}}, %Context{}, data) + when is_binary(data) do + Uniq.UUID.valid?(data) + end + def match?(_fixed, _context, _data), do: false end diff --git a/mix.exs b/mix.exs index b426c27..7be05cd 100644 --- a/mix.exs +++ b/mix.exs @@ -37,7 +37,8 @@ defmodule AvroEx.Mixfile do {:dialyxir, "~> 1.1", only: :dev, runtime: false}, {:ex_doc, "~> 0.20", only: :dev, runtime: false}, {:stream_data, "~> 0.5", only: [:dev, :test]}, - {:decimal, "~> 2.0", optional: true} + {:decimal, "~> 2.0", optional: true}, + {:uniq, "~> 0.6"} ] end diff --git a/mix.lock b/mix.lock index 0c9db1d..324b10f 100644 --- a/mix.lock +++ b/mix.lock @@ -14,4 +14,5 @@ "nimble_parsec": {:hex, :nimble_parsec, "1.2.3", "244836e6e3f1200c7f30cb56733fd808744eca61fd182f731eac4af635cc6d0b", [:mix], [], "hexpm", "c8d789e39b9131acf7b99291e93dae60ab48ef14a7ee9d58c6964f59efb570b0"}, "stream_data": {:hex, :stream_data, "0.5.0", "b27641e58941685c75b353577dc602c9d2c12292dd84babf506c2033cd97893e", [:mix], [], "hexpm", "012bd2eec069ada4db3411f9115ccafa38540a3c78c4c0349f151fc761b9e271"}, "typed_struct": {:hex, :typed_struct, "0.3.0", 
"939789e3c1dca39d7170c87f729127469d1315dcf99fee8e152bb774b17e7ff7", [:mix], [], "hexpm", "c50bd5c3a61fe4e198a8504f939be3d3c85903b382bde4865579bc23111d1b6d"}, + "uniq": {:hex, :uniq, "0.6.1", "369660ecbc19051be526df3aa85dc393af5f61f45209bce2fa6d7adb051ae03c", [:mix], [{:ecto, "~> 3.0", [hex: :ecto, repo: "hexpm", optional: true]}], "hexpm", "6426c34d677054b3056947125b22e0daafd10367b85f349e24ac60f44effb916"}, } diff --git a/test/decode_test.exs b/test/decode_test.exs index 7b403e6..42d6f83 100644 --- a/test/decode_test.exs +++ b/test/decode_test.exs @@ -344,6 +344,20 @@ defmodule AvroEx.Decode.Test do "decimalField4" => 5.3e-11 } end + + test "16 byte fixed uuid" do + {:ok, fixed_uuid_schema} = + AvroEx.decode_schema(~S({"type": "fixed", "size": 16, "name": "fixed_uuid", "logicalType":"uuid"})) + + # Example from https://en.wikipedia.org/wiki/Universally_unique_identifier#Textual_representation + canonical_string = "550e8400-e29b-41d4-a716-446655440000" + binary = :binary.encode_unsigned(113_059_749_145_936_325_402_354_257_176_981_405_696) + + assert {:ok, ^binary} = AvroEx.decode(fixed_uuid_schema, binary, uuid_format: :binary) + assert {:ok, ^binary} = AvroEx.decode(fixed_uuid_schema, binary) + + assert {:ok, ^canonical_string} = AvroEx.decode(fixed_uuid_schema, binary, uuid_format: :canonical_string) + end end describe "DecodingError" do @@ -354,5 +368,18 @@ defmodule AvroEx.Decode.Test do AvroEx.decode!(schema, <<"\nhell", 0xFFFF::16>>) end end + + test "invalid fixed uuid" do + {:ok, fixed_uuid_schema} = + AvroEx.decode_schema(~S({"type": "fixed", "size": 16, "name": "fixed_uuid", "logicalType":"uuid"})) + + non_uuid_binary = :binary.list_to_bin(List.duplicate(1, 16)) + + assert_raise DecodeError, + "Invalid binary UUID found <<1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>>.", + fn -> + AvroEx.decode!(fixed_uuid_schema, non_uuid_binary, uuid_format: :canonical_string) + end + end end end diff --git a/test/encode_test.exs b/test/encode_test.exs index 
1b69d69..130bfc9 100644 --- a/test/encode_test.exs +++ b/test/encode_test.exs @@ -134,6 +134,19 @@ defmodule AvroEx.Encode.Test do "decimalField4" => 5.3e-11 } end + + test "16 byte fixed uuid" do + assert %AvroEx.Schema{} = + schema = + AvroEx.decode_schema!(%{"type" => "fixed", "size" => 16, "name" => "fixed_uuid", "logicalType" => "uuid"}) + + # Example from https://en.wikipedia.org/wiki/Universally_unique_identifier#Textual_representation + canonical_string = "550e8400-e29b-41d4-a716-446655440000" + binary = :binary.encode_unsigned(113_059_749_145_936_325_402_354_257_176_981_405_696) + + assert {:ok, ^binary} = AvroEx.encode(schema, canonical_string) + assert {:ok, ^binary} = AvroEx.encode(schema, binary) + end end describe "variable_integer_encode" do @@ -282,51 +295,50 @@ defmodule AvroEx.Encode.Test do end describe "encode (union)" do + defp union_index(index) do + {:ok, int_schema} = AvroEx.decode_schema(~S("int")) + {:ok, index} = @test_module.encode(int_schema, index) + index + end + test "works as expected with nulls" do {:ok, schema} = AvroEx.decode_schema(~S(["null", "int"])) {:ok, null_schema} = AvroEx.decode_schema(~S("null")) - {:ok, int_schema} = AvroEx.decode_schema(~S("int")) - {:ok, index} = @test_module.encode(int_schema, 0) {:ok, encoded_null} = @test_module.encode(null_schema, nil) {:ok, encoded_union} = @test_module.encode(schema, nil) - assert encoded_union == index <> encoded_null + assert encoded_union == union_index(0) <> encoded_null end test "works as expected with ints" do {:ok, schema} = AvroEx.decode_schema(~S(["null", "int"])) {:ok, int_schema} = AvroEx.decode_schema(~S("int")) - {:ok, index} = @test_module.encode(int_schema, 1) {:ok, encoded_int} = @test_module.encode(int_schema, 2086) {:ok, encoded_union} = @test_module.encode(schema, 2086) - assert encoded_union == index <> encoded_int + assert encoded_union == union_index(1) <> encoded_int end test "works as expected with int and long" do {:ok, schema} = 
AvroEx.decode_schema(~S(["int", "long"])) - {:ok, int_schema} = AvroEx.decode_schema(~S("int")) {:ok, long_schema} = AvroEx.decode_schema(~S("long")) - {:ok, index} = @test_module.encode(int_schema, 1) {:ok, encoded_long} = @test_module.encode(long_schema, -3_376_656_585_598_455_353) {:ok, encoded_union} = @test_module.encode(schema, -3_376_656_585_598_455_353) - assert encoded_union == index <> encoded_long + assert encoded_union == union_index(1) <> encoded_long end test "works as expected with float and double" do {:ok, schema} = AvroEx.decode_schema(~S(["float", "double"])) - {:ok, int_schema} = AvroEx.decode_schema(~S("int")) {:ok, double_schema} = AvroEx.decode_schema(~S("double")) - {:ok, index} = @test_module.encode(int_schema, 1) {:ok, encoded_long} = @test_module.encode(double_schema, 0.0000000001) {:ok, encoded_union} = @test_module.encode(schema, 0.0000000001) - assert encoded_union == index <> encoded_long + assert encoded_union == union_index(1) <> encoded_long end test "works as expected with logical types" do @@ -336,11 +348,23 @@ defmodule AvroEx.Encode.Test do {:ok, schema} = AvroEx.decode_schema(~s(["null", #{datetime_json}])) {:ok, datetime_schema} = AvroEx.decode_schema(datetime_json) - {:ok, index} = @test_module.encode(datetime_schema, 1) {:ok, encoded_datetime} = @test_module.encode(datetime_schema, datetime_value) {:ok, encoded_union} = @test_module.encode(schema, datetime_value) - assert encoded_union == index <> encoded_datetime + assert encoded_union == union_index(1) <> encoded_datetime + end + + test "works as expected with 16 byte fixed UUID logical types" do + fixed_uuid_json = ~S({"type": "fixed", "size": 16, "name": "fixed_uuid", "logicalType": "uuid"}) + uuid_value = "550e8400-e29b-41d4-a716-446655440000" + + {:ok, schema} = AvroEx.decode_schema(~s(["null", #{fixed_uuid_json}])) + {:ok, fixed_uuid_schema} = AvroEx.decode_schema(fixed_uuid_json) + + {:ok, encoded_uuid} = @test_module.encode(fixed_uuid_schema, uuid_value) + {:ok, 
encoded_union} = @test_module.encode(schema, uuid_value) + + assert encoded_union == union_index(1) <> encoded_uuid end test "works as expected with records" do @@ -358,14 +382,12 @@ defmodule AvroEx.Encode.Test do json_schema = ~s(["null", #{record_json}]) {:ok, schema} = AvroEx.decode_schema(json_schema) - {:ok, int_schema} = AvroEx.decode_schema(~S("int")) {:ok, record_schema} = AvroEx.decode_schema(record_json) - {:ok, index} = @test_module.encode(int_schema, 1) {:ok, encoded_record} = @test_module.encode(record_schema, %{"a" => 25, "b" => "hello"}) {:ok, encoded_union} = @test_module.encode(schema, %{"a" => 25, "b" => "hello"}) - assert encoded_union == index <> encoded_record + assert encoded_union == union_index(1) <> encoded_record end test "works as expected with union values tagged for a named possibility" do @@ -384,14 +406,12 @@ defmodule AvroEx.Encode.Test do json_schema = ~s([#{record_json_factory.("a")}, #{record_json_factory.("b")}]) {:ok, schema} = AvroEx.decode_schema(json_schema) - {:ok, int_schema} = AvroEx.decode_schema(~S("int")) {:ok, record_schema} = AvroEx.decode_schema(record_json_factory.("b")) - {:ok, index} = @test_module.encode(int_schema, 1) {:ok, encoded_record} = @test_module.encode(record_schema, %{"value" => "hello"}) {:ok, encoded_union} = @test_module.encode(schema, {"b", %{"value" => "hello"}}) - assert encoded_union == index <> encoded_record + assert encoded_union == union_index(1) <> encoded_record end test "errors with a clear error for tagged unions" do From c5abbde8b60e90226f4390e8e7201c1b1f317754 Mon Sep 17 00:00:00 2001 From: urmastalimaa Date: Fri, 21 Feb 2025 13:59:22 +0200 Subject: [PATCH 2/2] Fix ubuntu version in CI workflows setup-beam does not allow ubuntu-24 --- .github/workflows/ci.yml | 4 ++-- .github/workflows/deploy.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0ade5bc..11d6f3f 100644 --- a/.github/workflows/ci.yml +++ 
b/.github/workflows/ci.yml @@ -9,7 +9,7 @@ on: jobs: test: name: Build and test - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 @@ -35,7 +35,7 @@ jobs: dialyzer: name: Run Dialyzer for type checking - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 - name: Set mix file hash diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index ada123e..c83a132 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -6,7 +6,7 @@ on: jobs: Publish: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 env: HEX_API_KEY: ${{ secrets.HEXPM_SECRET }} steps: