Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reproduce dataset for join operation [WIP] #15

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
395 changes: 395 additions & 0 deletions scripts/create_join_data.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,395 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "f39dbb52",
"metadata": {},
"source": [
"## Create dataframes for join\n",
"\n",
"There are some notes regarding the design of the R script: https://github.com/h2oai/db-benchmark/issues/106\n",
"\n",
"Originally these are the args you can pass to the R script to create the data:\n",
"\n",
"```R\n",
"N=as.numeric(args[1L]); K=as.integer(args[2L]); nas=as.integer(args[3L]); sort=as.integer(args[4L])\n",
"```\n",
"\n",
"K - has no effect anywhere in the script. \n",
"nas - number of NaNs. We will ignore the case with NaNs for now. \n",
"sort - we will work with unsorted data. \n",
"\n",
"For us, N is the only relevant number. "
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "9125c674",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "9b9e9b76",
"metadata": {},
"outputs": [],
"source": [
def split_xlr(n):
    """Partition a shuffled key pool of size int(n*1.1) into three disjoint groups.

    Draws the integers 1 .. int(n*1.1) in random order (without replacement)
    and slices the permutation into:
      * "x" -- the first 90% of n keys (shared by both join sides),
      * "l" -- the next 10% (keys present only on the left-hand side),
      * "r" -- the final 10% (keys present only on the right-hand side).

    Returns a dict mapping "x"/"l"/"r" to numpy integer arrays.
    """
    n = int(n)
    pool_size = int(n * 1.1)
    common_end = int(n * 0.9)
    # choice(..., replace=False) at full size is a random permutation of the pool.
    shuffled = np.random.choice(np.arange(1, pool_size + 1), size=pool_size, replace=False)
    return {
        "x": shuffled[:common_end],
        "l": shuffled[common_end:n],
        "r": shuffled[n:pool_size],
    }
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "2f454ba8",
"metadata": {},
"outputs": [],
"source": [
def sample_all(x, n_rows):
    """Return a shuffled array of length `n_rows` containing every value of `x`.

    Each element of `x` appears at least once; the remaining
    ``n_rows - len(x)`` slots are filled by sampling `x` with replacement,
    then the combined array is shuffled.

    Parameters
    ----------
    x : array-like
        Pool of values, each of which must appear in the output.
    n_rows : int or float
        Desired output length; must be >= len(x).

    Raises
    ------
    ValueError
        If ``n_rows`` is smaller than ``len(x)`` (every value could not appear).
    """
    n_rows = int(n_rows)
    # Explicit exception instead of `assert`, which is stripped under `python -O`.
    if len(x) > n_rows:
        raise ValueError(f"n_rows ({n_rows}) must be at least len(x) ({len(x)})")

    # Validation above guarantees n_rows - len(x) >= 0, so no max(..., 0) needed.
    extra = np.random.choice(x, size=n_rows - len(x), replace=True)
    y = np.append(x, extra)
    # Sampling without replacement at full size shuffles the combined array.
    return np.random.choice(y, size=len(y), replace=False)
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "3a052a76",
"metadata": {},
"outputs": [],
"source": [
def add_str_cols(col):
    """Return string ids for the numeric values in `col`, e.g. 7 -> "id7"."""
    # format(v, ".0f") renders the value with no decimal places, same as f"{v:.0f}".
    return ["id" + format(value, ".0f") for value in col]
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "697d0e6d",
"metadata": {},
"outputs": [],
"source": [
def pretty_num(num):
    """Format `num` compactly in scientific notation for file names, e.g. 1e7 -> "1E7".

    Formats with a zero-digit mantissa ("1E+07"), then strips the "+" sign and
    leading zeros from non-negative exponents. Unlike the original
    ``''.join(f"{num:.0E}".split("+0"))`` trick, this also handles exponents
    >= 10 correctly (1e10 -> "1E10" instead of "1E+10"). Negative exponents
    are left as formatted (e.g. 1e-7 -> "1E-07"), matching the old behavior.
    """
    mantissa, exponent = f"{num:.0E}".split("E")
    if exponent.startswith("+"):
        # Drop the sign and leading zeros; keep "0" when the exponent is zero.
        exponent = exponent[1:].lstrip("0") or "0"
    return f"{mantissa}E{exponent}"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "d9244795",
"metadata": {},
"outputs": [],
"source": [
def generate_keys(N):
    """Build the three key pools used by the join benchmark tables.

    The pools correspond to the three right-hand-side table sizes:
    small (N/1e6 keys), medium (N/1e3 keys) and big (N keys). Each pool is
    a dict with "x"/"l"/"r" arrays, as produced by split_xlr.
    """
    small_keys = split_xlr(N / 1e6)
    medium_keys = split_xlr(N / 1e3)
    big_keys = split_xlr(N)
    return small_keys, medium_keys, big_keys
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "2270dbd8",
"metadata": {},
"outputs": [],
"source": [
def create_lhs(N, key1, key2, key3, dir):
    """Create the N-row left-hand-side join table and write it to parquet.

    Columns:
      id1/id2/id3 -- integer join keys drawn from the x+l parts of the three
                     key pools (so ~90% of keys overlap the right-hand sides),
      id4/id5/id6 -- string versions of id1/id2/id3,
      v1          -- uniform random values in [0, 100), rounded to 6 decimals.

    Writes `<dir>/join-lhs-N_<pretty N>.parquet` (no index column).
    """
    N = int(N)

    # Integer keys: every x/l key appears at least once; the rest are
    # sampled with replacement up to N rows.
    id1 = sample_all(np.append(key1['x'], key1['l']), N)
    id2 = sample_all(np.append(key2['x'], key2['l']), N)
    id3 = sample_all(np.append(key3['x'], key3['l']), N)

    df = pd.DataFrame({
        "id1": id1,
        "id2": id2,
        "id3": id3,
        "id4": add_str_cols(id1),
        "id5": add_str_cols(id2),
        "id6": add_str_cols(id3),
        "v1": np.around(np.random.uniform(0, 100, size=N), decimals=6),
    })

    df.to_parquet(
        f"{dir}/join-lhs-N_{pretty_num(N)}.parquet",
        index=False,
    )
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "3c713f01",
"metadata": {},
"outputs": [],
"source": [
def create_rhs_small(N, key1, dir):
    """Create the small right-hand-side table (N/1e6 rows) and write it to parquet.

    Keys come from the x+r parts of key1, so ~90% of them overlap the LHS
    table. Columns: id1 (int key), id4 (string key), v2 (uniform values
    in [0, 100) rounded to 6 decimals).
    """
    n = int(N / 1e6)

    id1 = sample_all(np.append(key1['x'], key1['r']), n)

    frame = pd.DataFrame({
        "id1": id1,
        "id4": add_str_cols(id1),
        "v2": np.around(np.random.uniform(0, 100, size=n), decimals=6),
    })

    frame.to_parquet(
        f"{dir}/join-rhs-small-N_{pretty_num(N)}-n_{pretty_num(n)}.parquet",
        index=False,
    )
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "0fa0aefb",
"metadata": {},
"outputs": [],
"source": [
def create_rhs_medium(N, key1, key2, dir):
    """Create the medium right-hand-side table (N/1e3 rows) and write it to parquet.

    Keys come from the x+r parts of key1/key2, so ~90% of them overlap the
    LHS table. Columns: id1/id2 (int keys), id4/id5 (string keys),
    v2 (uniform values in [0, 100) rounded to 6 decimals).
    """
    n = int(N / 1e3)

    id1 = sample_all(np.append(key1['x'], key1['r']), n)
    id2 = sample_all(np.append(key2['x'], key2['r']), n)

    frame = pd.DataFrame({
        "id1": id1,
        "id2": id2,
        "id4": add_str_cols(id1),
        "id5": add_str_cols(id2),
        "v2": np.around(np.random.uniform(0, 100, size=n), decimals=6),
    })

    frame.to_parquet(
        f"{dir}/join-rhs-medium-N_{pretty_num(N)}-n_{pretty_num(n)}.parquet",
        index=False,
    )
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "5ce658ee",
"metadata": {},
"outputs": [],
"source": [
def create_rhs_big(N, key1, key2, key3, dir):
    """Create the big right-hand-side table (N rows) and write it to parquet.

    Keys come from the x+r parts of all three key pools, so ~90% of them
    overlap the LHS table. Columns: id1/id2/id3 (int keys), id4/id5/id6
    (string keys), v2 (uniform values in [0, 100) rounded to 6 decimals).
    """
    n = int(N / 1e0)

    id1 = sample_all(np.append(key1['x'], key1['r']), n)
    id2 = sample_all(np.append(key2['x'], key2['r']), n)
    id3 = sample_all(np.append(key3['x'], key3['r']), n)

    frame = pd.DataFrame({
        "id1": id1,
        "id2": id2,
        "id3": id3,
        "id4": add_str_cols(id1),
        "id5": add_str_cols(id2),
        "id6": add_str_cols(id3),
        "v2": np.around(np.random.uniform(0, 100, size=n), decimals=6),
    })

    frame.to_parquet(
        f"{dir}/join-rhs-big-N_{pretty_num(N)}-n_{pretty_num(n)}.parquet",
        index=False,
    )
]
},
# --- Driver: generate the join benchmark datasets --------------------------

# Output directory for the generated parquet files.
# NOTE(review): `dir` shadows the builtin; name kept for compatibility.
dir = "../test_join_data"

# Number of rows in the left-hand-side table.
N = 1e7

# Key pools for the three RHS cardinalities (N/1e6, N/1e3, N).
key1, key2, key3 = generate_keys(N)

# Left-hand-side table: N rows.
create_lhs(N, key1, key2, key3, dir)

# Right-hand-side tables at three sizes.
create_rhs_small(N, key1, dir)
create_rhs_medium(N, key1, key2, dir)
create_rhs_big(N, key1, key2, key3, dir)

# Sanity check (left commented out; uncomment to inspect the big RHS table):
# test = pd.read_parquet("../test_join_data/join-rhs-big-N_1E7-n_1E7.parquet")
# test.info()
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}