Separate the questions and process each one with the LLM individually to increase accuracy per input

HarrySu123 · HarrySu123 · commit fa6dd2a3ebe3 · 2025-07-10T17:05:43.000+01:00
diff --git a/conversion2025/mathpix_to_llm_to_in2lambda_to_JSON.ipynb b/conversion2025/mathpix_to_llm_to_in2lambda_to_JSON.ipynb
@@ -226,7 +226,7 @@
     "        end_location = fig_name.index(\"?\")\n",
     "        image_name = f\"{idx}_{fig_name[:end_location]}\"\n",
     "        fig_info[\"local_path\"] = image_name\n",
-    "        fig_info[\"image\"].save(f\"{media_path}{fig_info['local_path']}\")\n",
+    "        fig_info[\"image\"].save(f\"{media_path}/{fig_info['local_path']}\")\n",
     "\n",
     "save_figures_to_path(figures)"
    ]
@@ -248,6 +248,8 @@
    "source": [
     "def replace_figures_in_markdown(md_content, figures):\n",
     "    #replace the image URLs in the markdown content with local paths\n",
+    "    # add pictureTag for Lambda Feedback to recognise it as a picture\n",
+    "    md_content = md_content.replace(\"![]\", \"![pictureTag]\")\n",
     "    for fig_name, fig_info in figures.items():\n",
     "        md_content = md_content.replace(fig_info[\"url\"], fig_info[\"local_path\"])\n",
     "        print(f\"Replaced {fig_info['url']} with {fig_info['local_path']} in markdown content.\")\n",
@@ -300,6 +302,7 @@
     "    # full question and full solution\n",
     "    question_content: str = Field(..., description=\"The content of the question.\")\n",
     "    solution_content: str = Field(..., description=\"The content of the solution.\")\n",
+    "    images: list[str] = Field(..., description=\"A list of image URLs associated with the question.\")\n",
     "\n",
     "class AllQuestionsModel(BaseModel):\n",
     "    name: str = Field(..., description=\"Title of the set\")\n",
@@ -313,7 +316,8 @@
     "        2. Identify the year of the tutorial, if mentioned. Otherwise, use \"0\".\n",
     "        3. Every character should match the original source exactly unless you're instructed to split content into fields, without adding escapes or modifications.\n",
     "        4. Look through the entire markdown:\n",
-    "            - Without ignoring any mentions of images, figures, or other media.\n",
+    "            - Do not neglect any images, figures, or other media mentioned in the question, do not alter or neglect the alt text and the image URL.\n",
+    "            - Leave the Image links and alt text within the question/solution, but also make a copy and place it into the `images` field.\n",
     "            - Identify full Questions, place it into question_content\n",
     "            - Identify the full Worked Solution for each full Question.\n",
     "            - If the Worked Solution is not found, try to find the Answers associated with it instead.\n",
@@ -358,12 +362,6 @@
     "            print(\"LLM response successfully parsed as JSON with questions.\")\n",
     "            # For Pydantic v2, use model_dump() to convert the model to a dictionary.\n",
     "            return parsed_output.model_dump()\n",
-    "        except ValidationError as ve:\n",
-    "            print(\"❌ Pydantic Validation Error:\")\n",
-    "            for error in ve.errors():\n",
-    "                print(f\" - {error['loc']}: {error['msg']}\")\n",
-    "            print(\"Raw LLM output:\")\n",
-    "            print(response.content)\n",
     "        except Exception as e:\n",
     "            print(\"Error parsing LLM response as JSON:\")\n",
     "            print(\"Retrying...\")\n",
@@ -384,6 +382,7 @@
     "    content: str = Field(..., description=\"Content of the question (no exercise title, no subquestions)\")\n",
     "    parts: list[str] = Field(..., description=\"List of parts within the question (only the text, no numbering)\")\n",
     "    parts_solutions: list[str] = Field(..., description=\"List of worked solutions for the question (no numbering or counting)\")\n",
+    "    images: list[str] = Field(..., description=\"List of image URLs associated with the question (no alt text, only URLs)\")\n",
     "\n",
     "class Set(BaseModel):\n",
     "    name: str = Field(..., description=\"Title of the set\")\n",
@@ -398,15 +397,16 @@
     "    Please follow these steps carefully:\n",
     "        1. Every character should match the original source exactly unless you're instructed to split content into fields, without adding escapes or modifications.\n",
     "        2. Use the same name and year.\n",
-    "        3. For each question in questions:\n",
+    "        3. Use the same list of images as in the input for each question.\n",
+    "        4. For each question in questions:\n",
     "            - Title is the only field where you are allowed to name it whatever you seem fit for the question.\n",
-    "            - Do not neglect any images, figures, or other media mentioned in the question.\n",
+    "            - Do not neglect any images, figures, or other media mentioned in the question, do not alter or neglect the alt text and the image URL.\n",
     "            - Identify the stem and parts of the question, the parts may be obvious to find, like \"a)...\", \"b)...\", etc., or they could be implied by the question itself. All question must have at least one part, if there is only one part. :\n",
     "                1. The stem should be placed into the \"content\" field. Text in this field should be valid in the Milkdown editor. \n",
     "                2. the parts of the question (subquestions) should be placed into the \"parts\" field. Text in this field should be valid under Lexdown.\n",
     "                3. for each part, identify the worked solution/answer and place it into the \"parts_solutions\" field, if not found, leave as empty string, \"\". Text in this field should be valid under Lexdown.\n",
-    "        4. Output only a valid, plain, raw JSON string matching the schema above, ready to parse immediately, with no code fence or extra text. Use plain newlines (not escaped as `\\n`).\n",
-    "        5. The Text inside the JSON should be in Lexdown:\n",
+    "        5. Output only a valid, plain, raw JSON string matching the schema above, ready to parse immediately, with no code fence or extra text. Use plain newlines (not escaped as `\\n`).\n",
+    "        6. The Text inside the JSON should be in Lexdown:\n",
     "            1. preserving all LaTeX math delimiters (`$...$` and `$$...$$`) and all formatting exactly as in the input, without paraphrasing, summarizing, or simplifying any mathematical expressions or formulas.\n",
     "            2. Do not remove or collapse blank lines.\n",
     "            3. Do not escape characters like `\\n` or `\\\\`.\n",
@@ -441,49 +441,62 @@
     "              If parsing fails, returns None.\n",
     "    \"\"\"\n",
     "    # Initialize the output parser with the Tutorial schema.\n",
-    "    parser = PydanticOutputParser(pydantic_object=Set)\n",
+    "    parser = PydanticOutputParser(pydantic_object=Set_Question)\n",
     "\n",
-    "    # Construct the prompt, appending the parser's format instructions.\n",
-    "    prompt = f\"\"\"\n",
-    "        Input markdown:\n",
-    "        ```markdown\n",
-    "        {questions_dict}\n",
-    "        ```\n",
     "\n",
-    "        Your task is to extract a JSON with the following structure exactly:\n",
-    "        {parser.get_format_instructions()}\n",
-    "\n",
-    "        {llm_task_seperate_parts}\n",
-    "\n",
-    "        Return the JSON now.\n",
-    "        \"\"\"\n",
     "    \n",
-    "    # tries to call the LLM multiple times to ensure robustness.\n",
-    "    for i in range(3):\n",
-    "        \n",
-    "        # Call the LLM\n",
-    "        response = llm.invoke(prompt)\n",
-    "\n",
-    "        # Debug: print the raw LLM response\n",
-    "        # print(\"Raw LLM Response:\")\n",
-    "        # print(response)\n",
+    "    questions_in_parts = []\n",
+    "    for question in questions_dict[\"questions\"]:\n",
+    "        passed = False\n",
+    "\n",
+    "        for idx in range(3):\n",
+    "\n",
+    "            # Construct the prompt, appending the parser's format instructions.\n",
+    "            prompt = f\"\"\"\n",
+    "                Your task is to extract a JSON with the following structure exactly:\n",
+    "                {parser.get_format_instructions()}\n",
+    "\n",
+    "                {llm_task_seperate_parts}\n",
+    "\n",
+    "                Input JSON:\n",
+    "                ```JSON\n",
+    "                {question}\n",
+    "                ```\n",
+    "\n",
+    "                Return the JSON now.\n",
+    "                \"\"\"\n",
+    "            \n",
+    "            # Call the LLM\n",
+    "            response = llm.invoke(prompt)\n",
+    "\n",
+    "            # Debug: print the raw LLM response\n",
+    "            # print(\"Raw LLM Response:\")\n",
+    "            # print(response)\n",
+    "\n",
+    "            try:\n",
+    "                # Parse the response using the output parser.\n",
+    "                parsed_output = parser.parse(response.content)\n",
+    "                print(f\"LLM response successfully parsed question {idx}.\")\n",
+    "                # Mark success; conversion to a dictionary via model_dump() (Pydantic v2) happens once the full Set is built after the loop.\n",
+    "                passed = True\n",
+    "                break\n",
+    "            except Exception as e:\n",
+    "                print(\"Error parsing LLM response as JSON:\")\n",
+    "                print(\"Retrying...\")\n",
+    "                time.sleep(2)\n",
     "\n",
-    "        try:\n",
-    "            # Parse the response using the output parser.\n",
-    "            parsed_output = parser.parse(response.content)\n",
-    "            print(\"LLM response successfully parsed as JSON with parts.\")\n",
-    "            # For Pydantic v2, use model_dump() to convert the model to a dictionary.\n",
-    "            return parsed_output.model_dump()\n",
-    "        except ValidationError as ve:\n",
-    "            print(\"❌ Pydantic Validation Error:\")\n",
-    "            for error in ve.errors():\n",
-    "                print(f\" - {error['loc']}: {error['msg']}\")\n",
-    "            print(\"Raw LLM output:\")\n",
-    "            print(response.content)\n",
-    "        except Exception as e:\n",
-    "            print(\"Error parsing LLM response as JSON:\")\n",
-    "            print(\"Retrying...\")\n",
-    "            time.sleep(2)"
+    "        if not passed:\n",
+    "            raise Exception(\"Failed to parse LLM response as JSON after multiple attempts.\")\n",
+    "        \n",
+    "        questions_in_parts.append(parsed_output)\n",
+    "    \n",
+    "    return Set(\n",
+    "        name=questions_dict[\"name\"],\n",
+    "        year=questions_dict[\"year\"],\n",
+    "        questions=questions_in_parts\n",
+    "    ).model_dump()\n",
+    "    \n",
+    "        \n"
    ]
   },
   {
@@ -522,6 +535,9 @@
     "        7. Blank lines:\n",
     "            - Preserve all blank lines inside math blocks.\n",
     "            - Outside math, follow the structure of the original input.\n",
+    "        8. Alt text and image URLs:\n",
+    "            - Ensure that all image URLs and alt text are preserved as they appear in the original input.\n",
+    "            - The alt text must be `pictureTag`.\n",
     "        8. Output format:\n",
     "            - Output a single valid JSON string.\n",
     "            - Do not include any extra characters, explanations, or escaped formatting outside the JSON structure.\n",
@@ -570,12 +586,6 @@
     "            print(\"LLM response successfully parsed as JSON with valid $$.\")\n",
     "            # For Pydantic v2, use model_dump() to convert the model to a dictionary.\n",
     "            return parsed_output.model_dump()\n",
-    "        except ValidationError as ve:\n",
-    "            print(\"❌ Pydantic Validation Error:\")\n",
-    "            for error in ve.errors():\n",
-    "                print(f\" - {error['loc']}: {error['msg']}\")\n",
-    "            print(\"Raw LLM output:\")\n",
-    "            print(response.content)\n",
     "        except Exception as e:\n",
     "            print(\"Error parsing textdown LLM response as JSON:\")\n",
     "            print(\"Retrying...\")\n",
@@ -665,6 +675,8 @@
     "\n",
     "in2lambda_questions = []\n",
     "\n",
+    "\n",
+    "\n",
     "# Loop over all questions and question_answers and use in2lambda to create a JSON.\n",
     "for idx, question_dict in enumerate(questions, start=1):\n",
     "    parts = []\n",
@@ -678,20 +690,13 @@
     "    question = Question(\n",
     "        title=question_dict.get(\"title\", f\"Question {idx}\"),\n",
     "        main_text=question_dict.get(\"content\", \"\"),\n",
-    "        parts=parts\n",
+    "        parts=parts,\n",
+    "        images=[ f\"{media_path}/{img}\" for img in question_dict.get(\"images\", []) ]\n",
     "    )\n",
     "    in2lambda_questions.append(question)\n",
     "\n",
     "Module(in2lambda_questions).to_json(f\"{output_path}/out\")"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "24",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {