Merge branch 'main' of github.com:Renumics/sliceguard into main
dani2112 committed Jul 25, 2023
2 parents 7e90d12 + 85c321e commit ba2c3d7
Showing 7 changed files with 450 additions and 644 deletions.
9 changes: 5 additions & 4 deletions README.md
@@ -61,12 +61,13 @@ sg.report()
- [x] Limited embedding computation for images, audio, text
- [x] Extended embedding support, e.g., more embedding models and allow precomputed embeddings
- [x] Speed up embedding computation using datasets library
- [x] Improved issue detection algorithm, avoiding duplicate detections of similar problems and outliers influencing the segment detection
- [ ] Adaptive drop reference for datasets that contain a wide variety of data
- [ ] Generation of a summary report doing predefined checks
- [ ] Allow for control features in order to account for expected variations when running checks
- [ ] Different interfaces from min_drop, min_support. Maybe n_slices and sort by criterion?
- [ ] Soft Dependencies for embedding computation as torch dependencies are large
- [ ] Improve Spotlight report with embeddings in simmap and histogram for univariate analysis
- [ ] Extensive documentation and examples for common cases
- [ ] Data connectors for faster application on common data formats
- [ ] Improved explanations for found issues, e.g., via SHAP
- [ ] Generation of a summary report doing predefined checks
- [ ] Allow for control features in order to account for expected variations when running checks
- [ ] Improved issue detection algorithm, avoiding duplicate detections of similar problems and outliers influencing the segment detection
- [ ] Large data support for detection and reporting, e.g., 500k audio samples with transcriptions
23 changes: 13 additions & 10 deletions examples/stable_diffusion_evaluation.ipynb
@@ -68,7 +68,7 @@
"# I just chose the currently most trending prompt dataset on the huggingface hub.\n",
"# Replace that with anything that suits your need better or potentially your own\n",
"# list of potential prompts.\n",
"prompt_dataset = datasets.load_dataset(\"Gustavosta/Stable-Diffusion-Prompts\")"
"prompt_dataset = datasets.load_dataset(\"nateraw/parti-prompts\")"
]
},
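This commit swaps the prompt source from `Gustavosta/Stable-Diffusion-Prompts` to `nateraw/parti-prompts`. A quick way to see what the new dataset provides; the `Prompt`/`Category`/`Challenge` column names below are inferred from how the generation loop in this notebook indexes each record, so verify them against the dataset card:

```python
# Sketch: inspect the swapped-in prompt dataset. The column names are
# assumptions inferred from this notebook's usage, not confirmed here.
import datasets

prompt_dataset = datasets.load_dataset("nateraw/parti-prompts")
print(prompt_dataset["train"].column_names)  # expected: Prompt, Category, Challenge
print(prompt_dataset["train"][0])            # one record: a prompt plus its tags
```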
{
@@ -95,7 +95,7 @@
"source": [
"# Generate a bunch of images in the directory \"images\"\n",
"# The prompt dataset is relatively large so it could make sense to stop early.\n",
"target_dir = Path(\"images\")\n",
"target_dir = Path(\"images_parti\")\n",
"if not target_dir.is_dir():\n",
" target_dir.mkdir()\n",
"else:\n",
@@ -104,6 +104,8 @@
"\n",
"prompts = []\n",
"generated_images = []\n",
"challenges = []\n",
"categories = []\n",
"for prompt in prompt_dataset[\"train\"]:\n",
" try:\n",
" prompt = prompt[\"Prompt\"]\n",
@@ -117,8 +119,8 @@
" prompts.append(prompt)\n",
" generated_images.append(str(image_path))\n",
"\n",
" df = pd.DataFrame(data={\"image\": generated_images, \"prompt\": prompts})\n",
" df.to_json(\"sd_dataset.json\", orient=\"records\") # save this after every generation to not loose progress in case of crashing\n",
" df = pd.DataFrame(data={\"image\": generated_images, \"prompt\": prompts, \"category\": categories, \"challenge\": challenges})\n",
" df.to_json(\"sd_dataset_parti.json\", orient=\"records\") # save this after every generation to not loose progress in case of crashing\n",
" except:\n",
" print(\"An error occured while generating image.\")"
]
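Only fragments of the generation loop are visible in this hunk. Pieced together, and with the diffusers pipeline id and the image filename scheme filled in as assumptions (those parts are collapsed in the diff), the loop after this commit looks roughly like:

```python
# A hedged reconstruction of the generation loop; the pipeline checkpoint
# and the per-image filename scheme are assumptions, not shown in the diff.
from pathlib import Path

import datasets
import pandas as pd
from diffusers import StableDiffusionPipeline

prompt_dataset = datasets.load_dataset("nateraw/parti-prompts")
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to("cuda")

target_dir = Path("images_parti")
target_dir.mkdir(exist_ok=True)

prompts, generated_images, challenges, categories = [], [], [], []
for i, row in enumerate(prompt_dataset["train"]):
    try:
        prompt = row["Prompt"]
        image = pipe(prompt).images[0]
        image_path = target_dir / f"{i}.png"  # assumed naming scheme
        image.save(image_path)

        prompts.append(prompt)
        generated_images.append(str(image_path))
        categories.append(row["Category"])    # the two columns this commit starts tracking
        challenges.append(row["Challenge"])

        # Save after every generation so a crash does not lose progress.
        df = pd.DataFrame(data={"image": generated_images, "prompt": prompts,
                                "category": categories, "challenge": challenges})
        df.to_json("sd_dataset_parti.json", orient="records")
    except Exception:
        print("An error occurred while generating image.")
```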
@@ -170,7 +172,7 @@
"outputs": [],
"source": [
"# Read the dataset generated in the previous step\n",
"df = pd.read_json(\"sd_dataset.json\")"
"df = pd.read_json(\"sd_dataset_parti.json\")"
]
},
{
@@ -188,6 +190,7 @@
" img = img.convert('RGB')\n",
" np_img = np.array(img)\n",
" clip_score = metric(torch.Tensor(np_img).to(device), row[\"prompt\"]).detach().cpu().numpy()\n",
" clip_score = float(clip_score)\n",
" img.close()\n",
" clip_scores.append(clip_score)\n",
" except Exception as e:\n",
@@ -206,7 +209,7 @@
"scored_df = pd.concat((df, pd.DataFrame(data={\"clip_score\": clip_scores})), axis=1)\n",
"scored_df = scored_df.dropna()\n",
"scored_df[\"prompt\"] = scored_df[\"prompt\"].astype(\"str\")\n",
"scored_df.to_json(\"sd_dataset_scored.json\", orient=\"records\")"
"scored_df.to_json(\"sd_dataset_scored_parti.json\", orient=\"records\")"
]
},
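The scoring cell calls a `metric` that is constructed in a collapsed part of the diff; from the call signature it is plausibly torchmetrics' `CLIPScore`. A self-contained sketch under that assumption, including the `float(...)` cast this commit adds so each score survives the JSON round-trip as a plain number rather than a 0-dim array:

```python
# Sketch of the scoring loop, assuming `metric` is torchmetrics' CLIPScore
# (the cell defining it is not shown in this diff).
import numpy as np
import pandas as pd
import torch
from PIL import Image
from torchmetrics.multimodal.clip_score import CLIPScore

device = "cuda" if torch.cuda.is_available() else "cpu"
metric = CLIPScore(model_name_or_path="openai/clip-vit-base-patch16").to(device)

df = pd.read_json("sd_dataset_parti.json")
clip_scores = []
for _, row in df.iterrows():
    try:
        img = Image.open(row["image"]).convert("RGB")
        np_img = np.array(img)
        clip_score = metric(torch.Tensor(np_img).to(device), row["prompt"]).detach().cpu().numpy()
        clip_score = float(clip_score)  # the cast this commit adds
        img.close()
        clip_scores.append(clip_score)
    except Exception as e:
        print(e)
        clip_scores.append(np.nan)  # assumed fallback; the notebook drops NaNs afterwards
```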
{
@@ -241,7 +244,7 @@
"outputs": [],
"source": [
"# Load the dataset\n",
"df = pd.read_json(\"sd_dataset_scored.json\")"
"df = pd.read_json(\"sd_dataset_scored_parti.json\")"
]
},
{
@@ -286,9 +289,9 @@
"outputs": [],
"source": [
"# Save the new dataset\n",
"df[\"clip_text_embedding\"] = [e.tolist() for e in clip_text_embeddingsxt_embeddings]\n",
"df[\"clip_text_embedding\"] = [e.tolist() for e in clip_text_embeddings]\n",
"df[\"clip_image_embedding\"] = [e.tolist() for e in clip_image_embeddings]\n",
"df.to_json(\"sd_dataset_scored_embedded.json\")"
"df.to_json(\"sd_dataset_scored_embedded_parti.json\")"
]
},
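This hunk only fixes a paste typo in the column assignment (`clip_text_embeddingsxt_embeddings`) and renames the output file; the cell that actually computes `clip_text_embeddings` and `clip_image_embeddings` is collapsed. One plausible way those lists get produced, sketched with Hugging Face transformers' CLIP (an assumption, not the notebook's hidden cell):

```python
# Sketch: compute CLIP text and image embeddings per row with transformers.
# The checkpoint choice is an assumption.
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

clip_text_embeddings, clip_image_embeddings = [], []
with torch.no_grad():
    for _, row in df.iterrows():
        inputs = processor(text=[row["prompt"]], images=Image.open(row["image"]),
                           return_tensors="pt", padding=True, truncation=True)
        text_emb = model.get_text_features(input_ids=inputs["input_ids"],
                                           attention_mask=inputs["attention_mask"])
        image_emb = model.get_image_features(pixel_values=inputs["pixel_values"])
        clip_text_embeddings.append(text_emb[0].numpy())
        clip_image_embeddings.append(image_emb[0].numpy())
```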
{
@@ -327,7 +330,7 @@
"outputs": [],
"source": [
"# Load the dataset\n",
"df = pd.read_json(\"sd_dataset_scored_embedded.json\")"
"df = pd.read_json(\"sd_dataset_scored_embedded_parti.json\")"
]
},
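With the scored, embedded dataset loaded, the cells not shown in this diff presumably hand it to sliceguard. A sketch of that step, modeled on the `sg.report()` flow from the project README; the exact `find_issues` arguments here are assumptions and may differ from the collapsed cells:

```python
# Sketch: find prompt segments where the mean CLIP score is low.
# Argument names follow sliceguard's README-style API and are assumptions.
import numpy as np
from sliceguard import SliceGuard

sg = SliceGuard()
issues = sg.find_issues(
    df,
    features=["prompt"],  # let sliceguard embed the raw text feature itself
    y="clip_score",
    y_pred="clip_score",
    metric=lambda y_true, y_pred: np.mean(y_true),  # mean CLIP score per slice
)
sg.report()  # interactive report of the detected slices
```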
(Diffs for the remaining 5 changed files are not shown here.)
