From a1f075d108ea559bb8c189aabc6f2c531a2185d4 Mon Sep 17 00:00:00 2001 From: scharlottej13 Date: Fri, 7 Oct 2022 15:34:19 -0700 Subject: [PATCH 1/2] start to python script version of join-datagen.R --- scripts/convert-join-datagen.ipynb | 192 +++++++++++++++++++++++++++++ 1 file changed, 192 insertions(+) create mode 100644 scripts/convert-join-datagen.ipynb diff --git a/scripts/convert-join-datagen.ipynb b/scripts/convert-join-datagen.ipynb new file mode 100644 index 0000000..e0f1a48 --- /dev/null +++ b/scripts/convert-join-datagen.ipynb @@ -0,0 +1,192 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "b3693532-7b3a-44bc-ae50-ea90c940ff2f", + "metadata": {}, + "outputs": [], + "source": [ + "def create_single_df(N, K, nfiles, dir, i):\n", + " \"\"\"\n", + " Creates a single pandas dataframe that contains nrows=N/nfiles\n", + "\n", + " Parameters\n", + " ----------\n", + " N: int,\n", + " Total number of rows\n", + " K: int,\n", + " Number of groups\n", + " nfiles: int,\n", + " Number of output files\n", + " dir: str,\n", + " Output directory\n", + " i: int,\n", + " Integer to assign to the multiple files e.g. 
range(nfiles)\n", + " \"\"\"\n", + "\n", + " nrows = int(N / nfiles)\n", + "\n", + " sample_id12 = [f\"id{str(x).zfill(3)}\" for x in range(1, K + 1)]\n", + " sample_id3 = [f\"id{str(x).zfill(10)}\" for x in range(1, int(N / K) + 1)]\n", + "\n", + " id1 = np.random.choice(sample_id12, size=nrows, replace=True)\n", + " id2 = np.random.choice(sample_id12, size=nrows, replace=True)\n", + " id3 = np.random.choice(sample_id3, size=nrows, replace=True)\n", + " id4 = np.random.choice(K, size=nrows, replace=True)\n", + " id5 = np.random.choice(K, size=nrows, replace=True)\n", + " id6 = np.random.choice(int(N / K), size=nrows, replace=True)\n", + " v1 = np.random.choice(5, size=nrows, replace=True)\n", + " v2 = np.random.choice(15, size=nrows, replace=True)\n", + " v3 = np.random.uniform(0, 100, size=nrows)\n", + "\n", + " df = pd.DataFrame(\n", + " dict(\n", + " zip(\n", + " [f\"id{x}\" for x in range(1, 7)] + [\"v1\", \"v2\", \"v3\"],\n", + " [id1, id2, id3, id4, id5, id6, v1, v2, v3],\n", + " )\n", + " )\n", + " )\n", + "\n", + " df.to_csv(\n", + " f\"{dir}/groupby-N_{N}_K_{K}_file_{i}.csv\",\n", + " index=False,\n", + " float_format=\"{:.6f}\".format,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "0dd73dc4-9e76-466d-8c7d-48480e37e26a", + "metadata": {}, + "source": [ + "```r\n", + "# split into common (0.9) left (0.1) and right (0.1)\n", + "split_xlr = function(n) {\n", + " key = sample.int(n*1.1) # 1.1 = 0.9+0.1+0.1\n", + " list(\n", + " x = key[seq.int(1, n*0.9)],\n", + " l = key[seq.int(n*0.9+1, n)],\n", + " r = key[seq.int(n+1, n*1.1)]\n", + " )\n", + "}\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "id": "d672d961-1ddb-4681-b2e8-2f18433f4a63", + "metadata": {}, + "outputs": [], + "source": [ + "def split_xlr(n):\n", + " n = int(n)\n", + " key = np.random.choice(np.arange(1, int(n*1.1) + 1), size=int(n*1.1), replace=False)\n", + " x = key[np.arange(0, int(n*0.9))]\n", + " l = key[np.arange(int(n*0.9), n)]\n", + " r = 
key[np.arange(n, int(n*1.1))]\n", + " return {\"x\": x, \"l\": l, \"r\": r}" + ] + }, + { + "cell_type": "markdown", + "id": "3e9a0e72-4a82-451d-9714-2dc256ea31d3", + "metadata": {}, + "source": [ + "```r\n", + "sample_all = function(x, size) {\n", + " stopifnot(length(x) <= size)\n", + " y = c(x, sample(x, size=max(size-length(x), 0), replace=TRUE))\n", + " sample(y)\n", + "}\n", + "# lhs = ['x', 'l']\n", + "id1 = sample_all(unlist(key1[lhs], use.names=FALSE), N),\n", + "id2 = sample_all(unlist(key2[lhs], use.names=FALSE), N),\n", + "id3 = sample_all(unlist(key3[lhs], use.names=FALSE), N)\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "id": "1e94a0ee-4eac-4787-860b-97a0a3ae062e", + "metadata": {}, + "outputs": [], + "source": [ + "def sample_all(x, n_rows):\n", + " n_rows = int(n_rows)\n", + " assert(len(x) <= n_rows), \"I'm so sad\"\n", + " y = np.append(\n", + " x,\n", + " np.random.choice(x, size=max(n_rows-len(x), 0), replace=True)\n", + " )\n", + " return np.random.choice(y, size=len(y), replace=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "34f3ffd4-145f-47e7-a41b-9c8c0b3fa0f7", + "metadata": {}, + "outputs": [], + "source": [ + "N = 1e7\n", + "\n", + "key1 = split_xlr(N/1e6) # 10\n", + "key2 = split_xlr(N/1e3) # 10000\n", + "key3 = split_xlr(N) # 1e7" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "id": "74cbb8b0-ac1c-4323-986d-d9c6f52034d5", + "metadata": {}, + "outputs": [], + "source": [ + "id1 = sample_all(np.append(key1['x'], key1['l']), N)\n", + "id2 = sample_all(np.append(key2['x'], key2['l']), N)\n", + "id3 = sample_all(np.append(key3['x'], key3['l']), N)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9be793ea-b476-48a7-b786-02ecac3a306b", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01bfbf95-3c56-4151-a614-bb53a3398118", + "metadata": {}, + "outputs": [], + "source": [] + } + 
], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 92bed84dca18c3ffc06b2118b6bb832aabea287c Mon Sep 17 00:00:00 2001 From: ncclementi Date: Wed, 12 Oct 2022 18:27:23 -0400 Subject: [PATCH 2/2] cleanup data gen into functions --- scripts/convert-join-datagen.ipynb | 192 -------------- scripts/create_join_data.ipynb | 395 +++++++++++++++++++++++++++++ 2 files changed, 395 insertions(+), 192 deletions(-) delete mode 100644 scripts/convert-join-datagen.ipynb create mode 100644 scripts/create_join_data.ipynb diff --git a/scripts/convert-join-datagen.ipynb b/scripts/convert-join-datagen.ipynb deleted file mode 100644 index e0f1a48..0000000 --- a/scripts/convert-join-datagen.ipynb +++ /dev/null @@ -1,192 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "b3693532-7b3a-44bc-ae50-ea90c940ff2f", - "metadata": {}, - "outputs": [], - "source": [ - "def create_single_df(N, K, nfiles, dir, i):\n", - " \"\"\"\n", - " Creates a single pandas dataframe that contains nrows=N/nfiles\n", - "\n", - " Parameters\n", - " ----------\n", - " N: int,\n", - " Total number of rows\n", - " K: int,\n", - " Number of groups\n", - " nfiles: int,\n", - " Number of output files\n", - " dir: str,\n", - " Output directory\n", - " i: int,\n", - " Integer to assign to the multiple files e.g. 
range(nfiles)\n", - " \"\"\"\n", - "\n", - " nrows = int(N / nfiles)\n", - "\n", - " sample_id12 = [f\"id{str(x).zfill(3)}\" for x in range(1, K + 1)]\n", - " sample_id3 = [f\"id{str(x).zfill(10)}\" for x in range(1, int(N / K) + 1)]\n", - "\n", - " id1 = np.random.choice(sample_id12, size=nrows, replace=True)\n", - " id2 = np.random.choice(sample_id12, size=nrows, replace=True)\n", - " id3 = np.random.choice(sample_id3, size=nrows, replace=True)\n", - " id4 = np.random.choice(K, size=nrows, replace=True)\n", - " id5 = np.random.choice(K, size=nrows, replace=True)\n", - " id6 = np.random.choice(int(N / K), size=nrows, replace=True)\n", - " v1 = np.random.choice(5, size=nrows, replace=True)\n", - " v2 = np.random.choice(15, size=nrows, replace=True)\n", - " v3 = np.random.uniform(0, 100, size=nrows)\n", - "\n", - " df = pd.DataFrame(\n", - " dict(\n", - " zip(\n", - " [f\"id{x}\" for x in range(1, 7)] + [\"v1\", \"v2\", \"v3\"],\n", - " [id1, id2, id3, id4, id5, id6, v1, v2, v3],\n", - " )\n", - " )\n", - " )\n", - "\n", - " df.to_csv(\n", - " f\"{dir}/groupby-N_{N}_K_{K}_file_{i}.csv\",\n", - " index=False,\n", - " float_format=\"{:.6f}\".format,\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "0dd73dc4-9e76-466d-8c7d-48480e37e26a", - "metadata": {}, - "source": [ - "```r\n", - "# split into common (0.9) left (0.1) and right (0.1)\n", - "split_xlr = function(n) {\n", - " key = sample.int(n*1.1) # 1.1 = 0.9+0.1+0.1\n", - " list(\n", - " x = key[seq.int(1, n*0.9)],\n", - " l = key[seq.int(n*0.9+1, n)],\n", - " r = key[seq.int(n+1, n*1.1)]\n", - " )\n", - "}\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 97, - "id": "d672d961-1ddb-4681-b2e8-2f18433f4a63", - "metadata": {}, - "outputs": [], - "source": [ - "def split_xlr(n):\n", - " n = int(n)\n", - " key = np.random.choice(np.arange(1, int(n*1.1) + 1), size=int(n*1.1), replace=False)\n", - " x = key[np.arange(0, int(n*0.9))]\n", - " l = key[np.arange(int(n*0.9), n)]\n", - " r = 
key[np.arange(n, int(n*1.1))]\n", - " return {\"x\": x, \"l\": l, \"r\": r}" - ] - }, - { - "cell_type": "markdown", - "id": "3e9a0e72-4a82-451d-9714-2dc256ea31d3", - "metadata": {}, - "source": [ - "```r\n", - "sample_all = function(x, size) {\n", - " stopifnot(length(x) <= size)\n", - " y = c(x, sample(x, size=max(size-length(x), 0), replace=TRUE))\n", - " sample(y)\n", - "}\n", - "# lhs = ['x', 'l']\n", - "id1 = sample_all(unlist(key1[lhs], use.names=FALSE), N),\n", - "id2 = sample_all(unlist(key2[lhs], use.names=FALSE), N),\n", - "id3 = sample_all(unlist(key3[lhs], use.names=FALSE), N)\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 102, - "id": "1e94a0ee-4eac-4787-860b-97a0a3ae062e", - "metadata": {}, - "outputs": [], - "source": [ - "def sample_all(x, n_rows):\n", - " n_rows = int(n_rows)\n", - " assert(len(x) <= n_rows), \"I'm so sad\"\n", - " y = np.append(\n", - " x,\n", - " np.random.choice(x, size=max(n_rows-len(x), 0), replace=True)\n", - " )\n", - " return np.random.choice(y, size=len(y), replace=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 98, - "id": "34f3ffd4-145f-47e7-a41b-9c8c0b3fa0f7", - "metadata": {}, - "outputs": [], - "source": [ - "N = 1e7\n", - "\n", - "key1 = split_xlr(N/1e6) # 10\n", - "key2 = split_xlr(N/1e3) # 10000\n", - "key3 = split_xlr(N) # 1e7" - ] - }, - { - "cell_type": "code", - "execution_count": 107, - "id": "74cbb8b0-ac1c-4323-986d-d9c6f52034d5", - "metadata": {}, - "outputs": [], - "source": [ - "id1 = sample_all(np.append(key1['x'], key1['l']), N)\n", - "id2 = sample_all(np.append(key2['x'], key2['l']), N)\n", - "id3 = sample_all(np.append(key3['x'], key3['l']), N)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9be793ea-b476-48a7-b786-02ecac3a306b", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "01bfbf95-3c56-4151-a614-bb53a3398118", - "metadata": {}, - "outputs": [], - "source": [] - } - 
],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.13"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/scripts/create_join_data.ipynb b/scripts/create_join_data.ipynb
new file mode 100644
index 0000000..47fb7f3
--- /dev/null
+++ b/scripts/create_join_data.ipynb
@@ -0,0 +1,395 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "f39dbb52",
+ "metadata": {},
+ "source": [
+ "## Create dataframes for join\n",
+ "\n",
+ "There are some notes regarding design for the R script https://github.com/h2oai/db-benchmark/issues/106\n",
+ "\n",
+ "Originally these are the args you can pass to the R script to create the data:\n",
+ "\n",
+ "```R\n",
+ "N=as.numeric(args[1L]); K=as.integer(args[2L]); nas=as.integer(args[3L]); sort=as.integer(args[4L])\n",
+ "```\n",
+ "\n",
+ "K - does not have any effect in the whole script. \n",
+ "nas - Number of NaNs. We will ignore case with NaNs for now. \n",
+ "sort - We will work with unsorted data. \n",
+ "\n",
+ "For us, N is the only relevant number. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "9125c674", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9b9e9b76", + "metadata": {}, + "outputs": [], + "source": [ + "def split_xlr(n):\n", + " # split into common (0.9) left (0.1) and right (0.1)\n", + " \n", + " n = int(n)\n", + " key = np.random.choice(np.arange(1, int(n*1.1) + 1), size=int(n*1.1), replace=False)\n", + " x = key[np.arange(0, int(n*0.9))]\n", + " l = key[np.arange(int(n*0.9), n)]\n", + " r = key[np.arange(n, int(n*1.1))]\n", + " return {\"x\": x, \"l\": l, \"r\": r}" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2f454ba8", + "metadata": {}, + "outputs": [], + "source": [ + "def sample_all(x, n_rows):\n", + " n_rows = int(n_rows)\n", + " assert(len(x) <= n_rows)\n", + " \n", + " y = np.append(\n", + " x,\n", + " np.random.choice(x, size=max(n_rows-len(x), 0), replace=True)\n", + " )\n", + " return np.random.choice(y, size=len(y), replace=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "3a052a76", + "metadata": {}, + "outputs": [], + "source": [ + "def add_str_cols(col):\n", + " new_col = [f\"id{row:.0f}\" for row in col]\n", + " return new_col" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "697d0e6d", + "metadata": {}, + "outputs": [], + "source": [ + "def pretty_num(num):\n", + " return ''.join(f\"{num:.0E}\".split(\"+0\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "d9244795", + "metadata": {}, + "outputs": [], + "source": [ + "#generate keys\n", + "def generate_keys(N):\n", + " \n", + " key1 = split_xlr(N/1e6) \n", + " key2 = split_xlr(N/1e3) \n", + " key3 = split_xlr(N)\n", + " \n", + " return key1, key2, key3" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "2270dbd8", + "metadata": {}, + "outputs": [], + "source": [ + "def 
create_lhs(N, key1, key2, key3, dir):\n", + " \n", + " N = int(N)\n", + " \n", + " id1 = sample_all(np.append(key1['x'], key1['l']), N)\n", + " id2 = sample_all(np.append(key2['x'], key2['l']), N)\n", + " id3 = sample_all(np.append(key3['x'], key3['l']), N)\n", + " \n", + " id4 = add_str_cols(id1)\n", + " id5 = add_str_cols(id2)\n", + " id6 = add_str_cols(id3)\n", + " \n", + " v1 = np.around(np.random.uniform(0, 100, size=N), decimals=6)\n", + " \n", + " df = pd.DataFrame(\n", + " dict(\n", + " zip(\n", + " [f\"id{x}\" for x in range(1, 7)] + [\"v1\"],\n", + " [id1, id2, id3, id4, id5, id6, v1],\n", + " )\n", + " )\n", + " )\n", + " \n", + "# df.to_csv(\n", + "# f\"{dir}/join-lhs-N_{pretty_num(N)}.csv\",\n", + "# index=False,\n", + "# )\n", + " df.to_parquet(\n", + " f\"{dir}/join-lhs-N_{pretty_num(N)}.parquet\",\n", + " index=False,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "3c713f01", + "metadata": {}, + "outputs": [], + "source": [ + "def create_rhs_small(N, key1, dir):\n", + " \n", + " n = int(N/1e6)\n", + " \n", + " id1 = sample_all(np.append(key1['x'], key1['r']), n)\n", + " \n", + " id4 = add_str_cols(id1)\n", + " \n", + " v2 = np.around(np.random.uniform(0, 100, size=n), decimals=6)\n", + " \n", + " df = pd.DataFrame(\n", + " dict(\n", + " zip(\n", + " [\"id1\", \"id4\"] + [\"v2\"],\n", + " [id1, id4, v2],\n", + " )\n", + " )\n", + " )\n", + " \n", + " df.to_parquet(\n", + " f\"{dir}/join-rhs-small-N_{pretty_num(N)}-n_{pretty_num(n)}.parquet\",\n", + " index=False,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "0fa0aefb", + "metadata": {}, + "outputs": [], + "source": [ + "def create_rhs_medium(N, key1, key2, dir):\n", + " \n", + " n = int(N/1e3)\n", + " \n", + " id1 = sample_all(np.append(key1['x'], key1['r']), n)\n", + " id2 = sample_all(np.append(key2['x'], key2['r']), n)\n", + " \n", + " id4 = add_str_cols(id1)\n", + " id5 = add_str_cols(id2)\n", + " \n", + " v2 = 
np.around(np.random.uniform(0, 100, size=n), decimals=6)\n", + " \n", + " df = pd.DataFrame(\n", + " dict(\n", + " zip(\n", + " [\"id1\", \"id2\", \"id4\", \"id5\"] + [\"v2\"],\n", + " [id1, id2, id4, id5, v2],\n", + " )\n", + " )\n", + " )\n", + " \n", + " df.to_parquet(\n", + " f\"{dir}/join-rhs-medium-N_{pretty_num(N)}-n_{pretty_num(n)}.parquet\",\n", + " index=False,\n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "5ce658ee", + "metadata": {}, + "outputs": [], + "source": [ + "def create_rhs_big(N, key1, key2, key3, dir):\n", + " \n", + " n = int(N/1e0)\n", + " \n", + " id1 = sample_all(np.append(key1['x'], key1['r']), n)\n", + " id2 = sample_all(np.append(key2['x'], key2['r']), n)\n", + " id3 = sample_all(np.append(key3['x'], key3['r']), n)\n", + " \n", + " id4 = add_str_cols(id1)\n", + " id5 = add_str_cols(id2)\n", + " id6 = add_str_cols(id3)\n", + " \n", + " v2 = np.around(np.random.uniform(0, 100, size=n), decimals=6)\n", + " \n", + " df = pd.DataFrame(\n", + " dict(\n", + " zip(\n", + " [f\"id{x}\" for x in range(1, 7)] + [\"v2\"],\n", + " [id1, id2, id3, id4, id5, id6, v2],\n", + " )\n", + " )\n", + " )\n", + " \n", + " df.to_parquet(\n", + " f\"{dir}/join-rhs-big-N_{pretty_num(N)}-n_{pretty_num(n)}.parquet\",\n", + " index=False,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "c3df1c23", + "metadata": {}, + "outputs": [], + "source": [ + "dir = \"../test_join_data\"" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "9d2c498d", + "metadata": {}, + "outputs": [], + "source": [ + "N = 1e7" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "83034096", + "metadata": {}, + "outputs": [], + "source": [ + "key1, key2, key3 = generate_keys(N)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "8a06be3e", + "metadata": {}, + "outputs": [], + "source": [ + "create_lhs(N, key1, key2, key3, dir)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 15, + "id": "0112c305", + "metadata": {}, + "outputs": [], + "source": [ + "create_rhs_small(N, key1, dir)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "d4b79efa", + "metadata": {}, + "outputs": [], + "source": [ + "create_rhs_medium(N, key1, key2, dir)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "525474ac", + "metadata": {}, + "outputs": [], + "source": [ + "create_rhs_big(N, key1, key2, key3, dir)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "9ad5b5d1", + "metadata": {}, + "outputs": [], + "source": [ + "#test = pd.read_parquet(\"../test_join_data/join-rhs-big-N_1E7-n_1E7.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "cb8ce197", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 10000000 entries, 0 to 9999999\n", + "Data columns (total 7 columns):\n", + " # Column Dtype \n", + "--- ------ ----- \n", + " 0 id1 int64 \n", + " 1 id2 int64 \n", + " 2 id3 int64 \n", + " 3 id4 object \n", + " 4 id5 object \n", + " 5 id6 object \n", + " 6 v2 float64\n", + "dtypes: float64(1), int64(3), object(3)\n", + "memory usage: 534.1+ MB\n" + ] + } + ], + "source": [ + "#test.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "977e610b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}