Merge branch 'main' of github.com:Renumics/sliceguard into main
dani2112 committed Jul 25, 2023
2 parents 7e90d12 + 85c321e commit ba2c3d7
Showing 7 changed files with 450 additions and 644 deletions.
9 changes: 5 additions & 4 deletions README.md
@@ -61,12 +61,13 @@ sg.report()
- [x] Limited embedding computation for images, audio, text
- [x] Extended embedding support, e.g., more embedding models and allow precomputed embeddings
- [x] Speed up embedding computation using datasets library
- [x] Improved issue detection algorithm, avoiding duplicate detections of similar problems and outliers influencing the segment detection
- [ ] Adaptive drop reference for datasets that contain a wide variety of data
- [ ] Generation of a summary report doing predefined checks
- [ ] Allow for control features in order to account for expected variations when running checks
- [ ] Different interfaces from min_drop, min_support. Maybe n_slices and sort by criterion?
- [ ] Soft Dependencies for embedding computation as torch dependencies are large
- [ ] Improve Spotlight report with embeddings in simmap and histogram for univariate analysis
- [ ] Extensive documentation and examples for common cases
- [ ] Data connectors for faster application on common data formats
- [ ] Improved explanations for found issues, e.g., via SHAP
- [ ] Generation of a summary report doing predefined checks
- [ ] Allow for control features in order to account for expected variations when running checks
- [ ] Improved issue detection algorithm, avoiding duplicate detections of similar problems and outliers influencing the segment detection
- [ ] Large data support for detection and reporting, e.g., 500k audio samples with transcriptions
23 changes: 13 additions & 10 deletions examples/stable_diffusion_evaluation.ipynb
@@ -68,7 +68,7 @@
"# I just chose the currently most trending prompt dataset on the huggingface hub.\n",
"# Replace that with anything that suits your need better or potentially your own\n",
"# list of potential prompts.\n",
"prompt_dataset = datasets.load_dataset(\"Gustavosta/Stable-Diffusion-Prompts\")"
"prompt_dataset = datasets.load_dataset(\"nateraw/parti-prompts\")"
]
},
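This commit swaps the prompt source from `Gustavosta/Stable-Diffusion-Prompts` to `nateraw/parti-prompts`. A quick way to see what the new dataset provides; the `Prompt`/`Category`/`Challenge` column names below are inferred from how the generation loop in this notebook indexes each record, so verify them against the dataset card:

```python
# Sketch: inspect the swapped-in prompt dataset. The column names are
# assumptions inferred from this notebook's usage, not confirmed here.
import datasets

prompt_dataset = datasets.load_dataset("nateraw/parti-prompts")
print(prompt_dataset["train"].column_names)  # expected: Prompt, Category, Challenge
print(prompt_dataset["train"][0])            # one record: a prompt plus its tags
```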
{
@@ -95,7 +95,7 @@
"source": [
"# Generate a bunch of images in the directory \"images\"\n",
"# The prompt dataset is relatively large so it could make sense to stop early.\n",
"target_dir = Path(\"images\")\n",
"target_dir = Path(\"images_parti\")\n",
"if not target_dir.is_dir():\n",
" target_dir.mkdir()\n",
"else:\n",
@@ -104,6 +104,8 @@
"\n",
"prompts = []\n",
"generated_images = []\n",
"challenges = []\n",
"categories = []\n",
"for prompt in prompt_dataset[\"train\"]:\n",
" try:\n",
" prompt = prompt[\"Prompt\"]\n",
@@ -117,8 +119,8 @@
" prompts.append(prompt)\n",
" generated_images.append(str(image_path))\n",
"\n",
" df = pd.DataFrame(data={\"image\": generated_images, \"prompt\": prompts})\n",
" df.to_json(\"sd_dataset.json\", orient=\"records\") # save this after every generation to not loose progress in case of crashing\n",
" df = pd.DataFrame(data={\"image\": generated_images, \"prompt\": prompts, \"category\": categories, \"challenge\": challenges})\n",
" df.to_json(\"sd_dataset_parti.json\", orient=\"records\") # save this after every generation to not loose progress in case of crashing\n",
" except:\n",
" print(\"An error occured while generating image.\")"
]
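Only fragments of the generation loop are visible in this hunk. Pieced together, and with the diffusers pipeline id and the image filename scheme filled in as assumptions (those parts are collapsed in the diff), the loop after this commit looks roughly like:

```python
# A hedged reconstruction of the generation loop; the pipeline checkpoint
# and the per-image filename scheme are assumptions, not shown in the diff.
from pathlib import Path

import datasets
import pandas as pd
from diffusers import StableDiffusionPipeline

prompt_dataset = datasets.load_dataset("nateraw/parti-prompts")
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to("cuda")

target_dir = Path("images_parti")
target_dir.mkdir(exist_ok=True)

prompts, generated_images, challenges, categories = [], [], [], []
for i, row in enumerate(prompt_dataset["train"]):
    try:
        prompt = row["Prompt"]
        image = pipe(prompt).images[0]
        image_path = target_dir / f"{i}.png"  # assumed naming scheme
        image.save(image_path)

        prompts.append(prompt)
        generated_images.append(str(image_path))
        categories.append(row["Category"])    # the two columns this commit starts tracking
        challenges.append(row["Challenge"])

        # Save after every generation so a crash does not lose progress.
        df = pd.DataFrame(data={"image": generated_images, "prompt": prompts,
                                "category": categories, "challenge": challenges})
        df.to_json("sd_dataset_parti.json", orient="records")
    except Exception:
        print("An error occurred while generating image.")
```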
@@ -170,7 +172,7 @@
"outputs": [],
"source": [
"# Read the dataset generated in the previous step\n",
"df = pd.read_json(\"sd_dataset.json\")"
"df = pd.read_json(\"sd_dataset_parti.json\")"
]
},
{
@@ -188,6 +190,7 @@
" img = img.convert('RGB')\n",
" np_img = np.array(img)\n",
" clip_score = metric(torch.Tensor(np_img).to(device), row[\"prompt\"]).detach().cpu().numpy()\n",
" clip_score = float(clip_score)\n",
" img.close()\n",
" clip_scores.append(clip_score)\n",
" except Exception as e:\n",
@@ -206,7 +209,7 @@
"scored_df = pd.concat((df, pd.DataFrame(data={\"clip_score\": clip_scores})), axis=1)\n",
"scored_df = scored_df.dropna()\n",
"scored_df[\"prompt\"] = scored_df[\"prompt\"].astype(\"str\")\n",
"scored_df.to_json(\"sd_dataset_scored.json\", orient=\"records\")"
"scored_df.to_json(\"sd_dataset_scored_parti.json\", orient=\"records\")"
]
},
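The scoring cell calls a `metric` that is constructed in a collapsed part of the diff; from the call signature it is plausibly torchmetrics' `CLIPScore`. A self-contained sketch under that assumption, including the `float(...)` cast this commit adds so each score survives the JSON round-trip as a plain number rather than a 0-dim array:

```python
# Sketch of the scoring loop, assuming `metric` is torchmetrics' CLIPScore
# (the cell defining it is not shown in this diff).
import numpy as np
import pandas as pd
import torch
from PIL import Image
from torchmetrics.multimodal.clip_score import CLIPScore

device = "cuda" if torch.cuda.is_available() else "cpu"
metric = CLIPScore(model_name_or_path="openai/clip-vit-base-patch16").to(device)

df = pd.read_json("sd_dataset_parti.json")
clip_scores = []
for _, row in df.iterrows():
    try:
        img = Image.open(row["image"]).convert("RGB")
        np_img = np.array(img)
        clip_score = metric(torch.Tensor(np_img).to(device), row["prompt"]).detach().cpu().numpy()
        clip_score = float(clip_score)  # the cast this commit adds
        img.close()
        clip_scores.append(clip_score)
    except Exception as e:
        print(e)
        clip_scores.append(np.nan)  # assumed fallback; the notebook drops NaNs afterwards
```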
{
@@ -241,7 +244,7 @@
"outputs": [],
"source": [
"# Load the dataset\n",
"df = pd.read_json(\"sd_dataset_scored.json\")"
"df = pd.read_json(\"sd_dataset_scored_parti.json\")"
]
},
{
@@ -286,9 +289,9 @@
"outputs": [],
"source": [
"# Save the new dataset\n",
"df[\"clip_text_embedding\"] = [e.tolist() for e in clip_text_embeddingsxt_embeddings]\n",
"df[\"clip_text_embedding\"] = [e.tolist() for e in clip_text_embeddings]\n",
"df[\"clip_image_embedding\"] = [e.tolist() for e in clip_image_embeddings]\n",
"df.to_json(\"sd_dataset_scored_embedded.json\")"
"df.to_json(\"sd_dataset_scored_embedded_parti.json\")"
]
},
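This hunk only fixes a paste typo in the column assignment (`clip_text_embeddingsxt_embeddings`) and renames the output file; the cell that actually computes `clip_text_embeddings` and `clip_image_embeddings` is collapsed. One plausible way those lists get produced, sketched with Hugging Face transformers' CLIP (an assumption, not the notebook's hidden cell):

```python
# Sketch: compute CLIP text and image embeddings per row with transformers.
# The checkpoint choice is an assumption.
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

clip_text_embeddings, clip_image_embeddings = [], []
with torch.no_grad():
    for _, row in df.iterrows():
        inputs = processor(text=[row["prompt"]], images=Image.open(row["image"]),
                           return_tensors="pt", padding=True, truncation=True)
        text_emb = model.get_text_features(input_ids=inputs["input_ids"],
                                           attention_mask=inputs["attention_mask"])
        image_emb = model.get_image_features(pixel_values=inputs["pixel_values"])
        clip_text_embeddings.append(text_emb[0].numpy())
        clip_image_embeddings.append(image_emb[0].numpy())
```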
{
@@ -327,7 +330,7 @@
"outputs": [],
"source": [
"# Load the dataset\n",
"df = pd.read_json(\"sd_dataset_scored_embedded.json\")"
"df = pd.read_json(\"sd_dataset_scored_embedded_parti.json\")"
]
},
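With the scored, embedded dataset loaded, the cells not shown in this diff presumably hand it to sliceguard. A sketch of that step, modeled on the `sg.report()` flow from the project README; the exact `find_issues` arguments here are assumptions and may differ from the collapsed cells:

```python
# Sketch: find prompt segments where the mean CLIP score is low.
# Argument names follow sliceguard's README-style API and are assumptions.
import numpy as np
from sliceguard import SliceGuard

sg = SliceGuard()
issues = sg.find_issues(
    df,
    features=["prompt"],  # let sliceguard embed the raw text feature itself
    y="clip_score",
    y_pred="clip_score",
    metric=lambda y_true, y_pred: np.mean(y_true),  # mean CLIP score per slice
)
sg.report()  # interactive report of the detected slices
```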
(Diffs for the remaining 5 changed files are not shown here.)
