diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..b9bf211235b6d4ec294a64aa837252adfa59534b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,25 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +MegaTTS3/assets/Chinese_prompt.wav filter=lfs diff=lfs merge=lfs -text +MegaTTS3/assets/English_prompt.wav filter=lfs diff=lfs merge=lfs -text +MegaTTS3/assets/fig/Hi.gif filter=lfs diff=lfs merge=lfs -text +MegaTTS3/assets/fig/table_tts.png filter=lfs diff=lfs merge=lfs -text +MegaTTS3/assets/fig/table_wavvae.png filter=lfs diff=lfs merge=lfs -text +pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/source.pdf filter=lfs diff=lfs merge=lfs -text +pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/_page_4_Figure_0.jpeg filter=lfs diff=lfs merge=lfs -text +pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/source.pdf filter=lfs diff=lfs merge=lfs -text +pptagent/runs/ppt_video/ca046385-ac3d-4240-9284-a96c57d934d3/output.mp4 filter=lfs diff=lfs merge=lfs -text +pptagent/runs/ppt_video/e88b9f32-6b97-4096-abd6-9bee103524b6/output.mp4 filter=lfs diff=lfs merge=lfs -text +pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/images/203e2300314026057b7257a3c105a8d2fad5183e.png filter=lfs diff=lfs merge=lfs -text +pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/images/35639ff12c3127b2ba9419b7c784b212753ff628.png filter=lfs diff=lfs merge=lfs -text +pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/images/5452ff4f227c6ba1d7ad666974203486e642daf6.png filter=lfs diff=lfs merge=lfs -text +pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/images/a1c98d25e5c2a3059235733edc58ea6984e75dc9.png filter=lfs diff=lfs merge=lfs -text +pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/images/fee3a1e81ae1678f114d5799e440cc2b7d740aa1.png filter=lfs diff=lfs merge=lfs -text +pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/source.pptx filter=lfs diff=lfs merge=lfs -text +pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template.pptx filter=lfs diff=lfs merge=lfs -text +pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/source.pptx filter=lfs diff=lfs merge=lfs -text +templates/previews/Template1.jpg filter=lfs diff=lfs merge=lfs -text +templates/Template1.pptx filter=lfs diff=lfs merge=lfs -text +templates/Template2.pptx filter=lfs diff=lfs merge=lfs -text +templates/Template3.pptx filter=lfs diff=lfs merge=lfs -text diff --git a/MegaTTS3/CODE_OF_CONDUCT.md b/MegaTTS3/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000000000000000000000000000000000..d28f7f91013e84b601278006135c437ab1e0e7f5 --- /dev/null +++ b/MegaTTS3/CODE_OF_CONDUCT.md @@ -0,0 +1,128 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity +and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. 
+ +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the + overall community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or + advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email + address, without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series +of actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or +permanent ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. 
+ +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within +the community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.0, available at +https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. + +Community Impact Guidelines were inspired by [Mozilla's code of conduct +enforcement ladder](https://github.com/mozilla/diversity). + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see the FAQ at +https://www.contributor-covenant.org/faq. Translations are available at +https://www.contributor-covenant.org/translations. diff --git a/MegaTTS3/Dockerfile b/MegaTTS3/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..6211679afef38a510d4927ae7e0b43606cad8e81 --- /dev/null +++ b/MegaTTS3/Dockerfile @@ -0,0 +1,18 @@ +FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime + +WORKDIR /app + +RUN apt-get update && apt-get install -y \ + curl \ + python3 \ + python3-pip \ + ffmpeg \ + && apt-get clean + +COPY requirements.txt /app/ + +RUN pip install --no-cache-dir -r requirements.txt + +COPY . /app/ + +CMD ["python", "-m", "tts.gradio_api"] diff --git a/MegaTTS3/LICENSE b/MegaTTS3/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..315a3e8c3992b63922a9524fe7f48a4735d46a68 --- /dev/null +++ b/MegaTTS3/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [2025] ByteDance + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
\ No newline at end of file diff --git a/MegaTTS3/assets/Chinese_prompt.npy b/MegaTTS3/assets/Chinese_prompt.npy new file mode 100644 index 0000000000000000000000000000000000000000..b96373ba725154fce995657388f14eb3216f108f --- /dev/null +++ b/MegaTTS3/assets/Chinese_prompt.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd32490cefd81aecd7edb36b13dea4e8746ae9365d6aac065e6408477df7e70c +size 27264 diff --git a/MegaTTS3/assets/Chinese_prompt.wav b/MegaTTS3/assets/Chinese_prompt.wav new file mode 100644 index 0000000000000000000000000000000000000000..d3cad0bdfb31866408c6899bda685a20731a587e --- /dev/null +++ b/MegaTTS3/assets/Chinese_prompt.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbb0860cb4dd7c7003b6f0406299fc7c0febc5c6a990e1c670d29b763e84e7ed +size 384046 diff --git a/MegaTTS3/assets/English_prompt.npy b/MegaTTS3/assets/English_prompt.npy new file mode 100644 index 0000000000000000000000000000000000000000..c275020d7afce01af8b91adc7147867a09e8bdd1 --- /dev/null +++ b/MegaTTS3/assets/English_prompt.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a00a5c6bacedeee141d135e38e8d3c297a2326fcbfd9def9a657d4ac603c8b84 +size 22400 diff --git a/MegaTTS3/assets/English_prompt.wav b/MegaTTS3/assets/English_prompt.wav new file mode 100644 index 0000000000000000000000000000000000000000..cabacb48060d5e7219edc84bce1163178c02f5c8 --- /dev/null +++ b/MegaTTS3/assets/English_prompt.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5152e43ef1b2f72c95d64f216179b52d0b68d754785bb85b69ed9111036aa43 +size 317214 diff --git a/MegaTTS3/assets/fig/Hi.gif b/MegaTTS3/assets/fig/Hi.gif new file mode 100644 index 0000000000000000000000000000000000000000..bc5b1b70e07b809fc9e04a1b4d958b1a04b16327 --- /dev/null +++ b/MegaTTS3/assets/fig/Hi.gif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af4378fa510e5bc152501a0444e312eba7deb35d726fa4bd9567a7a504fb6df8 +size 291627 diff --git a/MegaTTS3/assets/fig/table_tts.png b/MegaTTS3/assets/fig/table_tts.png new file mode 100644 index 0000000000000000000000000000000000000000..26cf13c7055aaf8dcd9151d7e1eea776ad736850 --- /dev/null +++ b/MegaTTS3/assets/fig/table_tts.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cb8a1aee90bcc4de9b90e9883e93733fbfe7d006b018becaf0f58444a45a399 +size 104317 diff --git a/MegaTTS3/assets/fig/table_wavvae.png b/MegaTTS3/assets/fig/table_wavvae.png new file mode 100644 index 0000000000000000000000000000000000000000..fe41ed5984e6177237969001da44098b6ee0e6f8 --- /dev/null +++ b/MegaTTS3/assets/fig/table_wavvae.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:472a8a339388a3f1bd8bed1bfb8c3211a0cbf63974553dabafbcd5ca21178710 +size 152975 diff --git a/MegaTTS3/readme.md b/MegaTTS3/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..1d34a6319501024ccee8dfacedceac50f3361a32 --- /dev/null +++ b/MegaTTS3/readme.md @@ -0,0 +1,194 @@ +
+
+ Official PyTorch Implementation
+
+
+ in the current slide. The most similar paragraph found was `{similar_ele}`.",
+ "This error typically occurs when either: 1) multiple paragraphs are incorrectly merged into a single element, or 2) a single paragraph is incorrectly split into multiple items.",
+ )
+
+ elif element["type"] == "image":
+
+ for caption in element["data"]:
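+ # Fuzzy-match this reference caption against every Picture on the slide; the for/else
+ # `else` below raises only when no shape caption clears the 0.8 similarity threshold.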
+ for shape in slide.shape_filter(Picture):
+ similarity = edit_distance(shape.caption, caption)
+ if similarity > 0.8:
+ break
+ if similarity > max_similarity:
+ max_similarity = similarity
+ similar_ele = shape.caption
+ else:
+ raise ValueError(
+ f"Image caption of {el_name}: {caption} not found in the `alt` attribute of elements of current slide, the most similar caption is {similar_ele}"
+ )
+
+ else:
+ raise ValueError(
+ f"Unknown type of {el_name}: {element['type']}, should be one of ['text', 'image']"
+ )
+
+
+class SlideInducter:
+ """
+ Stage I: Presentation Analysis.
+ This stage analyzes the presentation: it clusters slides into different layouts and extracts a content schema for each layout.
+ """
+
+ def __init__(
+ self,
+ prs: Presentation,
+ ppt_image_folder: str,
+ template_image_folder: str,
+ config: Config,
+ image_models: list,
+ language_model: LLM,
+ vision_model: LLM,
+ use_assert: bool = True,
+ ):
+ """
+ Initialize the SlideInducter.
+
+ Args:
+ prs (Presentation): The presentation object.
+ ppt_image_folder (str): The folder containing PPT images.
+ template_image_folder (str): The folder containing normalized slide images.
+ config (Config): The configuration object.
+ image_models (list): A list of image models.
+ language_model (LLM): The language model.
+ vision_model (LLM): The vision model.
+ use_assert (bool): Whether to verify that slide and image counts match.
+ """
+ self.prs = prs
+ self.config = config
+ self.ppt_image_folder = ppt_image_folder
+ self.template_image_folder = template_image_folder
+ self.language_model = language_model
+ self.vision_model = vision_model
+ self.image_models = image_models
+ self.schema_extractor = Agent(
+ "schema_extractor",
+ {
+ "language": language_model,
+ },
+ )
+ if not use_assert:
+ return
+
+ num_template_images = sum(
+ is_image_path(f) for f in os.listdir(template_image_folder)
+ )
+ num_ppt_images = sum(is_image_path(f) for f in os.listdir(ppt_image_folder))
+ num_slides = len(prs.slides)
+
+ if not (num_template_images == num_ppt_images == num_slides):
+ raise ValueError(
+ f"Slide count mismatch detected:\n"
+ f"- Presentation slides: {num_slides}\n"
+ f"- Template images: {num_template_images} ({template_image_folder})\n"
+ f"- PPT images: {num_ppt_images} ({ppt_image_folder})\n"
+ f"All counts must be equal."
+ )
+
+ def layout_induct(self) -> dict:
+ """
+ Perform layout induction for the presentation; this should be called before content induction.
+ Returns a dict keyed by layout name (e.g. "Title and Content:text"), where each layout is a dict with:
+ - `template_id`: the id of the template slide
+ - `slides`: the list of slide ids
+ The returned dict also contains a `functional_keys` entry listing the functional layout names.
+ """
+ layout_induction = defaultdict(lambda: defaultdict(list))
+ content_slides_index, functional_cluster = self.category_split()
+ for layout_name, cluster in functional_cluster.items():
+ layout_induction[layout_name]["slides"] = cluster
+ layout_induction[layout_name]["template_id"] = cluster[0]
+
+ functional_keys = list(layout_induction.keys())
+ function_slides_index = set()
+ for layout_name, cluster in layout_induction.items():
+ function_slides_index.update(cluster["slides"])
+ used_slides_index = function_slides_index.union(content_slides_index)
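+ # Slide indices are 1-based; any slide not claimed by a functional cluster is treated
+ # as a content slide before layout clustering.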
+ for i in range(len(self.prs.slides)):
+ if i + 1 not in used_slides_index:
+ content_slides_index.add(i + 1)
+ self.layout_split(content_slides_index, layout_induction)
+ layout_induction["functional_keys"] = functional_keys
+ return layout_induction
+
+ def category_split(self):
+ """
+ Split slides into categories based on their functional purpose.
+ """
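+ # Expected LLM output: {functional_layout_name: [1-based slide indices]}; slides not
+ # assigned to any functional layout are treated as content slides.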
+ functional_cluster = self.language_model(
+ CATEGORY_SPLIT_TEMPLATE.render(slides=self.prs.to_text()),
+ return_json=True,
+ )
+ assert isinstance(functional_cluster, dict) and all(
+ isinstance(k, str) and isinstance(v, list)
+ for k, v in functional_cluster.items()
+ ), "Functional cluster must be a dictionary with string keys and list values"
+ functional_slides = set(sum(functional_cluster.values(), []))
+ content_slides_index = set(range(1, len(self.prs) + 1)) - functional_slides
+
+ return content_slides_index, functional_cluster
+
+ def layout_split(self, content_slides_index: set[int], layout_induction: dict):
+ """
+ Cluster slides into different layouts.
+ """
+ embeddings = get_image_embedding(self.template_image_folder, *self.image_models)
+ assert len(embeddings) == len(self.prs)
+ content_split = defaultdict(list)
+ for slide_idx in content_slides_index:
+ slide = self.prs.slides[slide_idx - 1]
+ content_type = slide.get_content_type()
+ layout_name = slide.slide_layout_name
+ content_split[(layout_name, content_type)].append(slide_idx)
+
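+ # Within each (slide_layout_name, content_type) group, cluster slides by visual
+ # similarity; the slide with the most shapes in each cluster becomes its template,
+ # and the vision model names the cluster from that template's rendering.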
+ for (layout_name, content_type), slides in content_split.items():
+ sub_embeddings = [
+ embeddings[f"slide_{slide_idx:04d}.jpg"] for slide_idx in slides
+ ]
+ similarity = images_cosine_similarity(sub_embeddings)
+ for cluster in get_cluster(similarity):
+ slide_indexs = [slides[i] for i in cluster]
+ template_id = max(
+ slide_indexs,
+ key=lambda x: len(self.prs.slides[x - 1].shapes),
+ )
+ cluster_name = (
+ self.vision_model(
+ ASK_CATEGORY_PROMPT,
+ pjoin(self.ppt_image_folder, f"slide_{template_id:04d}.jpg"),
+ )
+ + ":"
+ + content_type
+ )
+ layout_induction[cluster_name]["template_id"] = template_id
+ layout_induction[cluster_name]["slides"] = slide_indexs
+
+ def content_induct(self, layout_induction: dict):
+ """
+ Perform content schema extraction for the presentation.
+ """
+ for layout_name, cluster in layout_induction.items():
+ if layout_name == "functional_keys" or "content_schema" in cluster:
+ continue
+ slide = self.prs.slides[cluster["template_id"] - 1]
+ turn_id, schema = self.schema_extractor(slide=slide.to_html())
+ schema = self._fix_schema(schema, slide, turn_id)
+ layout_induction[layout_name]["content_schema"] = schema
+
+ return layout_induction
+
+ def _fix_schema(
+ self,
+ schema: dict,
+ slide: SlidePage,
+ turn_id: int = None,
+ retry: int = 0,
+ ) -> dict:
+ """
+ Fix schema by checking and retrying if necessary.
+ """
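+ # Validate the schema against the slide; on failure, feed the error and traceback back
+ # to the extractor and retry, giving up after three attempts.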
+ try:
+ check_schema(schema, slide)
+ except ValueError as e:
+ retry += 1
+ logger.debug("Failed at schema extraction: %s", e)
+ if retry == 3:
+ logger.error("Failed to extract schema for slide-%s: %s", turn_id, e)
+ raise e
+ schema = self.schema_extractor.retry(
+ e, traceback.format_exc(), turn_id, retry
+ )
+ return self._fix_schema(schema, slide, turn_id, retry)
+ return schema
+
+
+class SlideInducterAsync(SlideInducter):
+ def __init__(
+ self,
+ prs: Presentation,
+ ppt_image_folder: str,
+ template_image_folder: str,
+ config: Config,
+ image_models: list,
+ language_model: AsyncLLM,
+ vision_model: AsyncLLM,
+ ):
+ """
+ Initialize the SlideInducterAsync with async models.
+
+ Args:
+ prs (Presentation): The presentation object.
+ ppt_image_folder (str): The folder containing PPT images.
+ template_image_folder (str): The folder containing normalized slide images.
+ config (Config): The configuration object.
+ image_models (list): A list of image models.
+ language_model (AsyncLLM): The async language model.
+ vision_model (AsyncLLM): The async vision model.
+ """
+ super().__init__(
+ prs,
+ ppt_image_folder,
+ template_image_folder,
+ config,
+ image_models,
+ language_model,
+ vision_model,
+ )
+ self.language_model = self.language_model.to_async()
+ self.vision_model = self.vision_model.to_async()
+ self.schema_extractor = self.schema_extractor.to_async()
+
+ async def category_split(self):
+ """
+ Async version: Split slides into categories based on their functional purpose.
+ """
+ functional_cluster = await self.language_model(
+ CATEGORY_SPLIT_TEMPLATE.render(slides=self.prs.to_text()),
+ return_json=True,
+ )
+ assert isinstance(functional_cluster, dict) and all(
+ isinstance(k, str) and isinstance(v, list)
+ for k, v in functional_cluster.items()
+ ), "Functional cluster must be a dictionary with string keys and list values"
+ functional_slides = set(sum(functional_cluster.values(), []))
+ content_slides_index = set(range(1, len(self.prs) + 1)) - functional_slides
+
+ return content_slides_index, functional_cluster
+
+ async def layout_split(
+ self, content_slides_index: set[int], layout_induction: dict
+ ):
+ """
+ Async version: Cluster slides into different layouts.
+ """
+ embeddings = get_image_embedding(self.template_image_folder, *self.image_models)
+ assert len(embeddings) == len(self.prs)
+ content_split = defaultdict(list)
+ for slide_idx in content_slides_index:
+ slide = self.prs.slides[slide_idx - 1]
+ content_type = slide.get_content_type()
+ layout_name = slide.slide_layout_name
+ content_split[(layout_name, content_type)].append(slide_idx)
+
+ async with asyncio.TaskGroup() as tg:
+ for (layout_name, content_type), slides in content_split.items():
+ sub_embeddings = [
+ embeddings[f"slide_{slide_idx:04d}.jpg"] for slide_idx in slides
+ ]
+ similarity = images_cosine_similarity(sub_embeddings)
+ for cluster in get_cluster(similarity):
+ slide_indexs = [slides[i] for i in cluster]
+ template_id = max(
+ slide_indexs,
+ key=lambda x: len(self.prs.slides[x - 1].shapes),
+ )
+
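+ # Bind tid/sidxs/ctype as lambda defaults so each callback captures this iteration's
+ # values instead of the loop variables at completion time.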
+ tg.create_task(
+ self.vision_model(
+ ASK_CATEGORY_PROMPT,
+ pjoin(
+ self.ppt_image_folder, f"slide_{template_id:04d}.jpg"
+ ),
+ )
+ ).add_done_callback(
+ lambda f, tid=template_id, sidxs=slide_indexs, ctype=content_type: layout_induction[
+ f.result() + ":" + ctype
+ ].update(
+ {"template_id": tid, "slides": sidxs}
+ )
+ )
+
+ async def layout_induct(self):
+ """
+ Async version: Perform layout induction for the presentation.
+ """
+ layout_induction = defaultdict(lambda: defaultdict(list))
+ content_slides_index, functional_cluster = await self.category_split()
+ for layout_name, cluster in functional_cluster.items():
+ layout_induction[layout_name]["slides"] = cluster
+ layout_induction[layout_name]["template_id"] = cluster[0]
+
+ functional_keys = list(layout_induction.keys())
+ function_slides_index = set()
+ for layout_name, cluster in layout_induction.items():
+ function_slides_index.update(cluster["slides"])
+ used_slides_index = function_slides_index.union(content_slides_index)
+ for i in range(len(self.prs.slides)):
+ if i + 1 not in used_slides_index:
+ content_slides_index.add(i + 1)
+ await self.layout_split(content_slides_index, layout_induction)
+ layout_induction["functional_keys"] = functional_keys
+ return layout_induction
+
+ async def content_induct(self, layout_induction: dict):
+ """
+ Async version: Perform content schema extraction for the presentation.
+ """
+ async with asyncio.TaskGroup() as tg:
+ for layout_name, cluster in layout_induction.items():
+ if layout_name == "functional_keys" or "content_schema" in cluster:
+ continue
+ slide = self.prs.slides[cluster["template_id"] - 1]
+ coro = self.schema_extractor(slide=slide.to_html())
+
+ tg.create_task(self._fix_schema(coro, slide)).add_done_callback(
+ lambda f, key=layout_name: layout_induction[key].update(
+ {"content_schema": f.result()}
+ )
+ )
+
+ return layout_induction
+
+ async def _fix_schema(
+ self,
+ schema: dict | Coroutine[None, None, tuple[int, dict]],
+ slide: SlidePage,
+ turn_id: int = None,
+ retry: int = 0,
+ ):
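+ # On the first attempt `schema` is still the extractor coroutine; await it to obtain
+ # (turn_id, schema) before validation.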
+ if retry == 0:
+ turn_id, schema = await schema
+ try:
+ check_schema(schema, slide)
+ except ValueError as e:
+ retry += 1
+ logger.debug("Failed at schema extraction: %s", e)
+ if retry == 3:
+ logger.error("Failed to extract schema for slide-%s: %s", turn_id, e)
+ raise e
+ schema = await self.schema_extractor.retry(
+ e, traceback.format_exc(), turn_id, retry
+ )
+ return await self._fix_schema(schema, slide, turn_id, retry)
+ return schema
diff --git a/pptagent/llms.py b/pptagent/llms.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b8e660e29c3eb9075736b92050d502790cc0a2a
--- /dev/null
+++ b/pptagent/llms.py
@@ -0,0 +1,421 @@
+import base64
+import re
+import threading
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from oaib import Auto
+from openai import AsyncOpenAI, OpenAI
+from openai.types.chat import ChatCompletion
+
+from pptagent.utils import get_json_from_response, get_logger, tenacity_decorator
+
+logger = get_logger(__name__)
+
+
+@dataclass
+class LLM:
+ """
+ A wrapper class to interact with a language model.
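+
+ Example (sketch; the model name, endpoint, and key below are illustrative):
+ llm = LLM(model="gpt-4.1", base_url="https://api.openai.com/v1", api_key="sk-...")
+ answer = llm("Summarize the following slide text in one sentence: ...")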
+ """
+
+ model: str
+ base_url: Optional[str] = None
+ api_key: Optional[str] = None
+ timeout: int = 360
+
+ def __post_init__(self):
+ self.client = OpenAI(
+ base_url=self.base_url, api_key=self.api_key, timeout=self.timeout
+ )
+
+ @tenacity_decorator
+ def __call__(
+ self,
+ content: str,
+ images: Optional[Union[str, list[str]]] = None,
+ system_message: Optional[str] = None,
+ history: Optional[list] = None,
+ return_json: bool = False,
+ return_message: bool = False,
+ **client_kwargs,
+ ) -> Union[str, dict, list, tuple]:
+ """
+ Call the language model with a prompt and optional images.
+
+ Args:
+ content (str): The prompt content.
+ images (str or list[str]): An image file path or list of image file paths.
+ system_message (str): The system message.
+ history (list): The conversation history.
+ return_json (bool): Whether to return the response as JSON.
+ return_message (bool): Whether to return the message.
+ **client_kwargs: Additional keyword arguments to pass to the client.
+
+ Returns:
+ Union[str, Dict, List, Tuple]: The response from the model.
+ """
+ if history is None:
+ history = []
+ system, message = self.format_message(content, images, system_message)
+ try:
+ completion = self.client.chat.completions.create(
+ model=self.model, messages=system + history + message, **client_kwargs
+ )
+ except Exception as e:
+ logger.warning("Error in LLM call: %s", e)
+ raise e
+ response = completion.choices[0].message.content
+ message.append({"role": "assistant", "content": response})
+ return self.__post_process__(response, message, return_json, return_message)
+
+ def __post_process__(
+ self,
+ response: str,
+ message: list,
+ return_json: bool = False,
+ return_message: bool = False,
+ ) -> Union[str, dict, tuple]:
+ """
+ Process the response based on return options.
+
+ Args:
+ response (str): The raw response from the model.
+ message (List): The message history.
+ return_json (bool): Whether to return the response as JSON.
+ return_message (bool): Whether to return the message.
+
+ Returns:
+ Union[str, Dict, Tuple]: Processed response.
+ """
+ response = response.strip()
+ if return_json:
+ response = get_json_from_response(response)
+ if return_message:
+ response = (response, message)
+ return response
+
+ def __repr__(self) -> str:
+ repr_str = f"{self.__class__.__name__}(model={self.model}"
+ if self.base_url is not None:
+ repr_str += f", base_url={self.base_url}"
+ return repr_str + ")"
+
+ def test_connection(self) -> bool:
+ """
+ Test the connection to the LLM.
+
+ Returns:
+ bool: True if connection is successful, False otherwise.
+ """
+ try:
+ self.client.models.list()
+ return True
+ except Exception as e:
+ logger.warning(
+ "Connection test failed: %s\nLLM: %s: %s, %s",
+ e,
+ self.model,
+ self.base_url,
+ self.api_key,
+ )
+ return False
+
+ def format_message(
+ self,
+ content: str,
+ images: Optional[Union[str, list[str]]] = None,
+ system_message: Optional[str] = None,
+ ) -> tuple[list, list]:
+ """
+ Format messages for OpenAI server call.
+
+ Args:
+ content (str): The prompt content.
+ images (str or list[str]): An image file path or list of image file paths.
+ system_message (str): The system message.
+
+ Returns:
+ Tuple[List, List]: Formatted system and user messages.
+ """
+ if isinstance(images, str):
+ images = [images]
+ if system_message is None:
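+ # Prompts that begin with "You are" carry their own system message on the first line;
+ # promote that line and keep the remainder as the user content.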
+ if content.startswith("You are") and "\n" in content:
+ system_message, content = content.split("\n", 1)
+ else:
+ system_message = "You are a helpful assistant"
+ system = [
+ {
+ "role": "system",
+ "content": [{"type": "text", "text": system_message}],
+ }
+ ]
+ message = [{"role": "user", "content": [{"type": "text", "text": content}]}]
+ if images is not None:
+ for image in images:
+ try:
+ with open(image, "rb") as f:
+ message[0]["content"].append(
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": f"data:image/jpeg;base64,{base64.b64encode(f.read()).decode('utf-8')}"
+ },
+ }
+ )
+ except Exception as e:
+ logger.error("Failed to load image %s: %s", image, e)
+ return system, message
+
+ def gen_image(self, prompt: str, n: int = 1, **kwargs) -> str:
+ """
+ Generate an image from a prompt.
+ """
+ return (
+ self.client.images.generate(model=self.model, prompt=prompt, n=n, **kwargs)
+ .data[0]
+ .b64_json
+ )
+
+ def get_embedding(
+ self,
+ text: str,
+ encoding_format: str = "float",
+ to_tensor: bool = True,
+ **kwargs,
+ ) -> torch.Tensor | list[float]:
+ """
+ Get the embedding of a text.
+ """
+ result = self.client.embeddings.create(
+ model=self.model, input=text, encoding_format=encoding_format, **kwargs
+ )
+ embeddings = [embedding.embedding for embedding in result.data]
+ if to_tensor:
+ embeddings = torch.tensor(embeddings)
+ return embeddings
+
+ def to_async(self) -> "AsyncLLM":
+ """
+ Convert the LLM to an asynchronous LLM.
+ """
+ return AsyncLLM(
+ model=self.model,
+ base_url=self.base_url,
+ api_key=self.api_key,
+ timeout=self.timeout,
+ )
+
+
+@dataclass
+class AsyncLLM(LLM):
+ """
+ Asynchronous wrapper class for language model interaction.
+ """
+
+ use_batch: bool = False
+
+ def __post_init__(self):
+ """
+ Initialize the AsyncLLM client.
+
+ The dataclass fields (model, base_url, api_key, timeout) are populated by the
+ generated __init__ before this hook runs; the oaib batch runner is created only
+ on the main thread.
+ """
+ self.client = AsyncOpenAI(
+ base_url=self.base_url,
+ api_key=self.api_key,
+ timeout=self.timeout,
+ )
+ if threading.current_thread() == threading.main_thread():
+ self.batch = Auto(
+ base_url=self.base_url,
+ api_key=self.api_key,
+ timeout=self.timeout,
+ loglevel=0,
+ )
+ else:
+ logger.warning("Auto initialization skipped because it's not the main thread.")
+
+ @tenacity_decorator
+ async def __call__(
+ self,
+ content: str,
+ images: Optional[Union[str, list[str]]] = None,
+ system_message: Optional[str] = None,
+ history: Optional[list] = None,
+ return_json: bool = False,
+ return_message: bool = False,
+ **client_kwargs,
+ ) -> Union[str, dict, tuple]:
+ """
+ Asynchronously call the language model with a prompt and optional images.
+
+ Args:
+ content (str): The prompt content.
+ images (str or list[str]): An image file path or list of image file paths.
+ system_message (str): The system message.
+ history (list): The conversation history.
+ return_json (bool): Whether to return the response as JSON.
+ return_message (bool): Whether to return the message.
+ **client_kwargs: Additional keyword arguments to pass to the client.
+
+ Returns:
+ Union[str, Dict, List, Tuple]: The response from the model.
+ """
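+ # In batch mode the request goes through oaib's Auto queue, which is rebuilt on each
+ # call and is only safe to drive from the main thread (hence the warning below).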
+ if self.use_batch and threading.current_thread() is threading.main_thread():
+ self.batch = Auto(
+ base_url=self.base_url,
+ api_key=self.api_key,
+ timeout=self.timeout,
+ loglevel=0,
+ )
+ elif self.use_batch:
+ logger.warning(
+ "AsyncLLM is not running in the main thread; this may cause a race condition."
+ )
+ if history is None:
+ history = []
+ system, message = self.format_message(content, images, system_message)
+ try:
+ if self.use_batch:
+ await self.batch.add(
+ "chat.completions.create",
+ model=self.model,
+ messages=system + history + message,
+ **client_kwargs,
+ )
+ completion = await self.batch.run()
+ if "result" not in completion or len(completion["result"]) != 1:
+ raise ValueError(
+ f"The batch result should contain exactly one completion, but got {completion}.\nA race condition may have occurred if multiple values were returned; otherwise the LLM call itself failed, so use the synchronous version to check."
+ )
+ completion = ChatCompletion(**completion["result"][0])
+ else:
+ completion = await self.client.chat.completions.create(
+ model=self.model,
+ messages=system + history + message,
+ **client_kwargs,
+ )
+
+ except Exception as e:
+ logger.warning("Error in AsyncLLM call: %s", e)
+ raise e
+ response = completion.choices[0].message.content
+ message.append({"role": "assistant", "content": response})
+ return self.__post_process__(response, message, return_json, return_message)
+
+ def __getstate__(self):
+ state = self.__dict__.copy()
+ state["client"] = None
+ state["batch"] = None
+ return state
+
+ def __setstate__(self, state: dict):
+ self.__dict__.update(state)
+ self.client = AsyncOpenAI(
+ base_url=self.base_url,
+ api_key=self.api_key,
+ timeout=self.timeout,
+ )
+ self.batch = Auto(
+ base_url=self.base_url,
+ api_key=self.api_key,
+ timeout=self.timeout,
+ loglevel=0,
+ )
+
+ async def test_connection(self) -> bool:
+ """
+ Test the connection to the LLM asynchronously.
+
+ Returns:
+ bool: True if connection is successful, False otherwise.
+ """
+ try:
+ await self.client.models.list()
+ return True
+ except Exception as e:
+ logger.warning(
+ "Async connection test failed: %s\nLLM: %s: %s, %s",
+ e,
+ self.model,
+ self.base_url,
+ self.api_key,
+ )
+ return False
+
+ async def gen_image(self, prompt: str, n: int = 1, **kwargs) -> str:
+ """
+ Generate an image from a prompt asynchronously.
+
+ Args:
+ prompt (str): The text prompt to generate an image from.
+ n (int): Number of images to generate.
+ **kwargs: Additional keyword arguments for image generation.
+
+ Returns:
+ str: Base64-encoded image data.
+ """
+ response = await self.client.images.generate(
+ model=self.model, prompt=prompt, n=n, response_format="b64_json", **kwargs
+ )
+ return response.data[0].b64_json
+
+ async def get_embedding(
+ self,
+ text: str,
+ to_tensor: bool = True,
+ **kwargs,
+ ) -> torch.Tensor | list[float]:
+ """
+ Get the embedding of a text asynchronously.
+
+ Args:
+ text (str): The text to get embeddings for.
+ **kwargs: Additional keyword arguments.
+
+ Returns:
+ torch.Tensor | list[float]: The embedding(s), returned as a tensor when to_tensor is True.
+ """
+ response = await self.client.embeddings.create(
+ model=self.model,
+ input=text,
+ encoding_format="float",
+ **kwargs,
+ )
+ embeddings = [embedding.embedding for embedding in response.data]
+ if to_tensor:
+ embeddings = torch.tensor(embeddings)
+ return embeddings
+
+ def to_sync(self) -> LLM:
+ """
+ Convert the AsyncLLM to a synchronous LLM.
+ """
+ return LLM(model=self.model, base_url=self.base_url, api_key=self.api_key)
+
+
+def get_model_abbr(llms: Union[LLM, list[LLM]]) -> str:
+ """
+ Get abbreviated model names from LLM instances.
+
+ Args:
+ llms: A single LLM instance or a list of LLM instances.
+
+ Returns:
+ str: Abbreviated model names joined with '+'.
+ """
+ # Convert single LLM to list for consistent handling
+ if isinstance(llms, LLM):
+ llms = [llms]
+
+ try:
+ # Attempt to extract model names before version numbers
+ return "+".join(re.search(r"^(.*?)-\d{2}", llm.model).group(1) for llm in llms)
+ except Exception:
+ # Fallback: return full model names if pattern matching fails
+ return "+".join(llm.model for llm in llms)
diff --git a/pptagent/model_utils.py b/pptagent/model_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..84c07b58dcc6d44d5f868cd99bcadd23b6a84b3f
--- /dev/null
+++ b/pptagent/model_utils.py
@@ -0,0 +1,317 @@
+import json
+import os
+from copy import deepcopy
+from typing import Optional
+
+import numpy as np
+import torch
+import torchvision.transforms as T
+from marker.config.parser import ConfigParser
+from marker.converters.pdf import PdfConverter
+from marker.models import create_model_dict
+from marker.output import text_from_rendered
+from PIL import Image
+from transformers import AutoModel, AutoProcessor
+
+from pptagent.llms import LLM, AsyncLLM
+from pptagent.presentation import Presentation, SlidePage
+from pptagent.utils import get_logger, is_image_path, pjoin
+
+logger = get_logger(__name__)
+
+
+class ModelManager:
+ """
+ A class to manage models.
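+
+ Example (sketch; the endpoint and key below are illustrative):
+ manager = ModelManager(api_base="https://api.openai.com/v1", api_key="sk-...")
+ language_model = manager.language_model # an AsyncLLM used for text generation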
+ """
+
+ def __init__(
+ self,
+ api_base: Optional[str] = None,
+ api_key: Optional[str] = None,
+ language_model_name: Optional[str] = None,
+ vision_model_name: Optional[str] = None,
+ text_model_name: Optional[str] = None,
+ ):
+ """Initialize models from environment variables after instance creation"""
+ if api_base is None:
+ api_base = os.environ.get("API_BASE", None)
+ if api_key is None:
+ api_key = os.environ.get("OPENAI_API_KEY", None)
+ if language_model_name is None:
+ language_model_name = os.environ.get("LANGUAGE_MODEL", "gpt-4.1")
+ if vision_model_name is None:
+ vision_model_name = os.environ.get("VISION_MODEL", "gpt-4.1")
+ if text_model_name is None:
+ text_model_name = os.environ.get("TEXT_MODEL", "text-embedding-3-small")
+ self.api_base = api_base
+ self.api_key = api_key
+ self._image_model = None
+ self._marker_model = None
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ self.language_model = AsyncLLM(language_model_name, api_base, api_key=api_key)
+ self.vision_model = AsyncLLM(vision_model_name, api_base, api_key=api_key)
+ self.text_model = AsyncLLM(text_model_name, api_base, api_key=api_key)
+
+ @property
+ def image_model(self):
+ if self._image_model is None:
+ self._image_model = get_image_model(device=self.device)
+ return self._image_model
+
+ @property
+ def marker_model(self):
+ if self._marker_model is None:
+ self._marker_model = create_model_dict(
+ device=self.device, dtype=torch.float16
+ )
+ return self._marker_model
+
+ async def test_connections(self) -> bool:
+ """Test connections for all LLM models
+
+ Returns:
+ bool: True if all connections are successful, False otherwise
+ """
+ try:
+ assert await self.language_model.test_connection()
+ assert await self.vision_model.test_connection()
+ assert await self.text_model.test_connection()
+ except Exception:
+ return False
+ return True
+
+
+def prs_dedup(
+ presentation: Presentation,
+ model: LLM,
+ threshold: float = 0.8,
+) -> list[SlidePage]:
+ """
+ Deduplicate slides in a presentation based on text similarity.
+
+ Args:
+ presentation (Presentation): The presentation object containing slides.
+ model (LLM): The model used for generating text embeddings.
+ threshold (float): The similarity threshold for deduplication.
+
+ Returns:
+ list: A list of removed duplicate slides.
+ """
+ text_embeddings = model.get_embedding([i.to_text() for i in presentation.slides])
+ pre_embedding = text_embeddings[0]
+ slide_idx = 1
+ duplicates = []
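+ # Compare each slide's text embedding with its predecessor's; when cosine similarity
+ # exceeds the threshold, mark the earlier slide of the pair for removal.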
+ while slide_idx < len(presentation):
+ cur_embedding = text_embeddings[slide_idx]
+ if torch.cosine_similarity(pre_embedding, cur_embedding, -1) > threshold:
+ duplicates.append(slide_idx - 1)
+ slide_idx += 1
+ pre_embedding = cur_embedding
+ return [presentation.slides.pop(i) for i in reversed(duplicates)]
+
+
+def get_image_model(device: str = None):
+ """
+ Initialize and return an image model and its feature extractor.
+
+ Args:
+ device (str): The device to run the model on.
+
+ Returns:
+ tuple: A tuple containing the feature extractor and the image model.
+ """
+ model_base = "google/vit-base-patch16-224-in21k"
+ return (
+ AutoProcessor.from_pretrained(
+ model_base,
+ torch_dtype=torch.float16,
+ device_map=device,
+ use_fast=True,
+ ),
+ AutoModel.from_pretrained(
+ model_base,
+ torch_dtype=torch.float16,
+ device_map=device,
+ ).eval(),
+ )
+
+
+def parse_pdf(
+ pdf_path: str,
+ output_path: str,
+ model_lst: list,
+) -> str:
+ """
+ Parse a PDF file and extract text and images.
+
+ Args:
+ pdf_path (str): The path to the PDF file.
+ output_path (str): The directory to save the extracted content.
+ model_lst (list): A list of models for processing the PDF.
+
+ Returns:
+ str: The full text extracted from the PDF.
+ """
+ os.makedirs(output_path, exist_ok=True)
+ config_parser = ConfigParser(
+ {
+ "output_format": "markdown",
+ }
+ )
+ converter = PdfConverter(
+ config=config_parser.generate_config_dict(),
+ artifact_dict=model_lst,
+ processor_list=config_parser.get_processors(),
+ renderer=config_parser.get_renderer(),
+ )
+ rendered = converter(pdf_path)
+ full_text, _, images = text_from_rendered(rendered)
+ with open(pjoin(output_path, "source.md"), "w+", encoding="utf-8") as f:
+ f.write(full_text)
+ for filename, image in images.items():
+ image_filepath = os.path.join(output_path, filename)
+ image.save(image_filepath, "JPEG")
+ with open(pjoin(output_path, "meta.json"), "w+", encoding="utf-8") as f:
+ f.write(json.dumps(rendered.metadata, indent=4))
+
+ return full_text
+
+
+def get_image_embedding(
+ image_dir: str, extractor, model, batchsize: int = 16
+) -> dict[str, torch.Tensor]:
+ """
+ Generate image embeddings for images in a directory.
+
+ Args:
+ image_dir (str): The directory containing images.
+ extractor: The feature extractor for images.
+ model: The model used for generating embeddings.
+ batchsize (int): The batch size for processing images.
+
+ Returns:
+ dict: A dictionary mapping image filenames to their embeddings.
+ """
+ transform = T.Compose(
+ [
+ T.Resize(int((256 / 224) * extractor.size["height"])),
+ T.CenterCrop(extractor.size["height"]),
+ T.ToTensor(),
+ T.Normalize(mean=extractor.image_mean, std=extractor.image_std),
+ ]
+ )
+
+ inputs = []
+ embeddings = []
+ images = [i for i in sorted(os.listdir(image_dir)) if is_image_path(i)]
+ for file in images:
+ image = Image.open(pjoin(image_dir, file)).convert("RGB")
+ inputs.append(transform(image))
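+ # Run the model once the buffer holds `batchsize` images or on the final image, then clear it.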
+ if len(inputs) % batchsize == 0 or file == images[-1]:
+ batch = {"pixel_values": torch.stack(inputs).to(model.device)}
+ embeddings.extend(model(**batch).last_hidden_state.detach())
+ inputs.clear()
+ return {image: embedding.flatten() for image, embedding in zip(images, embeddings)}
+
+
+def images_cosine_similarity(embeddings: list[torch.Tensor]) -> torch.Tensor:
+ """
+ Calculate the cosine similarity matrix for a list of embeddings.
+ Args:
+ embeddings (list[torch.Tensor]): A list of image embeddings.
+
+ Returns:
+ torch.Tensor: A NxN similarity matrix.
+ """
+ embeddings = [embedding for embedding in embeddings]
+ sim_matrix = torch.zeros((len(embeddings), len(embeddings)))
+ for i in range(len(embeddings)):
+ for j in range(i + 1, len(embeddings)):
+ sim_matrix[i, j] = sim_matrix[j, i] = torch.cosine_similarity(
+ embeddings[i], embeddings[j], -1
+ )
+ return sim_matrix
+
+
+IMAGENET_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_STD = (0.229, 0.224, 0.225)
+
+
+def average_distance(
+ similarity: torch.Tensor, idx: int, cluster_idx: list[int]
+) -> float:
+ """
+ Calculate the average distance between a point (idx) and a cluster (cluster_idx).
+
+ Args:
+ similarity (torch.Tensor): The similarity matrix.
+ idx (int): The index of the point.
+ cluster_idx (list): The indices of the cluster.
+
+ Returns:
+ float: The average distance.
+ """
+ if idx in cluster_idx:
+ return 0
+ total_similarity = 0
+ for idx_in_cluster in cluster_idx:
+ total_similarity += similarity[idx, idx_in_cluster]
+ return total_similarity / len(cluster_idx)
+
+
+def get_cluster(similarity: np.ndarray, sim_bound: float = 0.65):
+ """
+ Cluster points based on similarity.
+
+ Args:
+ similarity (np.ndarray): The similarity matrix.
+ sim_bound (float): The similarity threshold for clustering.
+
+ Returns:
+ list: A list of clusters.
+ """
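+ # Greedily grow clusters: prefer attaching the unassigned point with the highest
+ # average similarity (above sim_bound) to an existing cluster; otherwise seed a new
+ # cluster from the most similar remaining pair, and emit any leftover points as
+ # singleton clusters once the maximum similarity drops below sim_bound. Rows and
+ # columns of assigned points are zeroed out as they are consumed.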
+ sim_copy = deepcopy(similarity)
+ num_points = sim_copy.shape[0]
+ clusters = []
+ added = [False] * num_points
+
+ while True:
+ max_avg_dist = sim_bound
+ best_cluster = None
+ best_point = None
+
+ for c in clusters:
+ for point_idx in range(num_points):
+ if added[point_idx]:
+ continue
+ avg_dist = average_distance(sim_copy, point_idx, c)
+ if avg_dist > max_avg_dist:
+ max_avg_dist = avg_dist
+ best_cluster = c
+ best_point = point_idx
+
+ if best_point is not None:
+ best_cluster.append(best_point)
+ added[best_point] = True
+ sim_copy[best_point, :] = 0
+ sim_copy[:, best_point] = 0
+ else:
+ if sim_copy.max() < sim_bound:
+ # append the remaining points as individual clusters
+ for i in range(num_points):
+ if not added[i]:
+ clusters.append([i])
+ break
+ i, j = np.unravel_index(np.argmax(sim_copy), sim_copy.shape)
+ clusters.append([int(i), int(j)])
+ added[i] = True
+ added[j] = True
+ sim_copy[i, :] = 0
+ sim_copy[:, i] = 0
+ sim_copy[j, :] = 0
+ sim_copy[:, j] = 0
+
+ return clusters
diff --git a/pptagent/multimodal.py b/pptagent/multimodal.py
new file mode 100644
index 0000000000000000000000000000000000000000..d605bde9a9fb22a41e3f249b135d9981493e0025
--- /dev/null
+++ b/pptagent/multimodal.py
@@ -0,0 +1,143 @@
+import asyncio
+from typing import Optional
+
+import PIL.Image
+
+from pptagent.llms import LLM, AsyncLLM
+from pptagent.presentation import Picture, Presentation
+from pptagent.utils import Config, get_logger, package_join, pbasename, pjoin
+
+logger = get_logger(__name__)
+
+
+class ImageLabler:
+ """
+ A class to extract images information, including caption, size, and appearance times in a presentation.
+ """
+
+ def __init__(self, presentation: Presentation, config: Config):
+ """
+ Initialize the ImageLabler.
+
+ Args:
+ presentation (Presentation): The presentation object.
+ config (Config): The configuration object.
+ """
+ self.presentation = presentation
+ self.slide_area = presentation.slide_width.pt * presentation.slide_height.pt
+ self.image_stats = {}
+ self.config = config
+ self.collect_images()
+
+ def apply_stats(self, image_stats: Optional[dict[str, dict]] = None):
+ """
+ Apply image captions to the presentation.
+ """
+ if image_stats is None:
+ image_stats = self.image_stats
+
+ for slide in self.presentation.slides:
+ for shape in slide.shape_filter(Picture):
+ if shape.caption is None:
+ caption = image_stats[pbasename(shape.img_path)]["caption"]
+ shape.caption = max(caption.split("\n"), key=len)
+
+ async def caption_images_async(self, vision_model: AsyncLLM):
+ """
+ Generate captions for images in the presentation asynchronously.
+
+ Args:
+ vision_model (AsyncLLM): The async vision model to use for captioning.
+
+ Returns:
+ dict: Dictionary containing image stats with captions.
+ """
+ assert isinstance(
+ vision_model, AsyncLLM
+ ), "vision_model must be an AsyncLLM instance"
+ caption_prompt = open(package_join("prompts", "caption.txt")).read()
+
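+ # Caption all uncaptioned images concurrently; each task writes its result
+ # back into image_stats through the done-callback registered below.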
+ async with asyncio.TaskGroup() as tg:
+ for image, stats in self.image_stats.items():
+ if "caption" not in stats:
+ task = tg.create_task(
+ vision_model(
+ caption_prompt,
+ pjoin(self.config.IMAGE_DIR, image),
+ )
+ )
+ task.add_done_callback(
+ lambda t, image=image: (
+ self.image_stats[image].update({"caption": t.result()}),
+ logger.debug("captioned %s: %s", image, t.result()),
+ )
+ )
+
+ self.apply_stats()
+ return self.image_stats
+
+ def caption_images(self, vision_model: LLM):
+ """
+ Generate captions for images in the presentation.
+
+ Args:
+ vision_model (LLM): The vision model to use for captioning.
+
+ Returns:
+ dict: Dictionary containing image stats with captions.
+ """
+ assert isinstance(vision_model, LLM), "vision_model must be an LLM instance"
+ caption_prompt = open(package_join("prompts", "caption.txt")).read()
+ for image, stats in self.image_stats.items():
+ if "caption" not in stats:
+ stats["caption"] = vision_model(
+ caption_prompt, pjoin(self.config.IMAGE_DIR, image)
+ )
+ logger.debug("captioned %s: %s", image, stats["caption"])
+ self.apply_stats()
+ return self.image_stats
+
+ def collect_images(self):
+ """
+ Collect images from the presentation and gather other information.
+ """
+ for slide_index, slide in enumerate(self.presentation.slides):
+ for shape in slide.shape_filter(Picture):
+ image_path = pbasename(shape.img_path)
+ if image_path == "pic_placeholder.png":
+ continue
+ if image_path not in self.image_stats:
+ size = PIL.Image.open(pjoin(self.config.IMAGE_DIR, image_path)).size
+ self.image_stats[image_path] = {
+ "size": size,
+ "appear_times": 0,
+ "slide_numbers": set(),
+ "relative_area": shape.area / self.slide_area * 100,
+ }
+ self.image_stats[image_path]["appear_times"] += 1
+ self.image_stats[image_path]["slide_numbers"].add(slide_index + 1)
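+ # Summarize the slides each image appears on as up to three consecutive
+ # ranges (e.g. "2-5, 9").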
+ for image_path, stats in self.image_stats.items():
+ stats["slide_numbers"] = sorted(list(stats["slide_numbers"]))
+ ranges = self._find_ranges(stats["slide_numbers"])
+ top_ranges = sorted(ranges, key=lambda x: x[1] - x[0], reverse=True)[:3]
+ top_ranges_str = ", ".join(
+ [f"{r[0]}-{r[1]}" if r[0] != r[1] else f"{r[0]}" for r in top_ranges]
+ )
+ stats["top_ranges_str"] = top_ranges_str
+
+ def _find_ranges(self, numbers):
+ """
+ Find consecutive ranges in a list of numbers.
+ """
+ ranges = []
+ start = numbers[0]
+ end = numbers[0]
+ for num in numbers[1:]:
+ if num == end + 1:
+ end = num
+ else:
+ ranges.append((start, end))
+ start = num
+ end = num
+ ranges.append((start, end))
+ return ranges
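+
+
+# A minimal usage sketch (illustrative names; assumes a parsed Presentation and
+# a Config whose IMAGE_DIR holds the extracted images):
+#   labler = ImageLabler(presentation, config)
+#   stats = labler.caption_images(vision_model)
+#   # or asynchronously: stats = await labler.caption_images_async(async_vision_model)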
diff --git a/pptagent/pptgen.py b/pptagent/pptgen.py
new file mode 100644
index 0000000000000000000000000000000000000000..5007dd13c204b09153eff1186cb5ae4a7b1f1bb7
--- /dev/null
+++ b/pptagent/pptgen.py
@@ -0,0 +1,963 @@
+import asyncio
+import json
+import traceback
+from abc import ABC, abstractmethod
+from copy import deepcopy
+from dataclasses import dataclass
+from enum import Enum
+from typing import Optional
+
+from pptagent.agent import Agent
+from pptagent.apis import API_TYPES, CodeExecutor
+from pptagent.document import Document, OutlineItem
+from pptagent.llms import LLM, AsyncLLM
+from pptagent.presentation import Layout, Picture, Presentation, SlidePage, StyleArg
+from pptagent.utils import Config, edit_distance, get_logger, tenacity_decorator
+
+logger = get_logger(__name__)
+
+style = StyleArg.all_true()
+style.area = False
+
+
+class FunctionalLayouts(Enum):
+ OPENING = "opening"
+ TOC = "table of contents"
+ SECTION_OUTLINE = "section outline"
+ ENDING = "ending"
+
+
+FunctionalContent = {
+ FunctionalLayouts.OPENING.value: "This slide is a presentation opening, presenting available meta information, like title, author, date, etc.",
+ FunctionalLayouts.TOC.value: "This slide is the Table of Contents, outlining the presentation's sections. Please use the given Table of Contents, and remove numbering to generate the slide content.",
+ FunctionalLayouts.SECTION_OUTLINE.value: "This slide is a section start, briefly presenting the section title and, optionally, the section summary.",
+ FunctionalLayouts.ENDING.value: "This slide is an *ending slide*, simply express your gratitude like 'Thank you!' or '谢谢' as the main title and *do not* include other meta information if not specified.",
+}
+
+
+@dataclass
+class PPTGen(ABC):
+ """
+ Stage II: Presentation Generation
+ An abstract base class for generating PowerPoint presentations.
+ It accepts a reference presentation as input, then generates a presentation outline and slides.
+ """
+
+ roles = []
+ text_embedder: LLM | AsyncLLM
+ language_model: LLM | AsyncLLM
+ vision_model: LLM | AsyncLLM
+ retry_times: int = 3
+ sim_bound: float = 0.5
+ force_pages: bool = False
+ error_exit: bool = False
+ record_cost: bool = False
+ length_factor: float | None = None
+ _initialized: bool = False
+
+ def __post_init__(self):
+ self._initialized = False
+ self._hire_staffs(self.record_cost, self.language_model, self.vision_model)
+ assert (
+ self.length_factor is None or self.length_factor > 0
+ ), "length_factor must be positive or None"
+
+ def set_reference(
+ self,
+ config: Config,
+ slide_induction: dict,
+ presentation: Presentation,
+ hide_small_pic_ratio: Optional[float] = 0.2,
+ keep_in_background: bool = True,
+ ):
+ """
+ Set the reference presentation and extracted presentation information.
+
+ Args:
+ config (Config): The configuration object.
+ slide_induction (dict): The slide induction data.
+ presentation (Presentation): The presentation object.
+ hide_small_pic_ratio (Optional[float]): Pictures smaller than this fraction of the slide area are hidden; None disables hiding.
+ keep_in_background (bool): Whether hidden pictures are kept as slide backgrounds instead of being dropped.
+
+ Returns:
+ PPTGen: The updated PPTGen object.
+ """
+ self.config = config
+ self.presentation = presentation
+
+ self.functional_layouts = slide_induction.pop("functional_keys")
+ self.text_layouts = [
+ k
+ for k in slide_induction
+ if k.endswith("text") and k not in self.functional_layouts
+ ]
+ self.multimodal_layouts = [
+ k
+ for k in slide_induction
+ if not k.endswith("text") and k not in self.functional_layouts
+ ]
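+ # Fall back to the other group so both lists stay non-empty even when the
+ # reference deck only contains one kind of layout.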
+ if len(self.text_layouts) == 0:
+ self.text_layouts = self.multimodal_layouts
+ if len(self.multimodal_layouts) == 0:
+ self.multimodal_layouts = self.text_layouts
+
+ self.layouts = {k: Layout.from_dict(k, v) for k, v in slide_induction.items()}
+ self.empty_prs = deepcopy(self.presentation)
+ assert (
+ hide_small_pic_ratio is None or hide_small_pic_ratio > 0
+ ), "hide_small_pic_ratio must be positive or None"
+ if hide_small_pic_ratio is not None:
+ self._hide_small_pics(hide_small_pic_ratio, keep_in_background)
+ self._initialized = True
+ return self
+
+ def generate_pres(
+ self,
+ source_doc: Document,
+ num_slides: Optional[int] = None,
+ outline: Optional[list[OutlineItem]] = None,
+ ):
+ """
+ Generate a PowerPoint presentation.
+
+ Args:
+ source_doc (Document): The source document.
+ num_slides (Optional[int]): The number of slides to generate.
+ outline (Optional[List[OutlineItem]]): The outline of the presentation.
+
+ Returns:
+ dict: A dictionary containing the presentation data and history.
+
+ Raises:
+ ValueError: if failed to generate presentation outline.
+ """
+ assert self._initialized, "PPTGen not initialized, call `set_reference` first"
+ self.source_doc = source_doc
+ succ_flag = True
+ if outline is None:
+ self.outline = self.generate_outline(num_slides, source_doc)
+ else:
+ self.outline = outline
+ self.simple_outline = "\n".join(
+ [
+ f"Slide {slide_idx+1}: {item.purpose}"
+ for slide_idx, item in enumerate(self.outline)
+ ]
+ )
+ generated_slides = []
+ code_executors = []
+ for slide_idx, outline_item in enumerate(self.outline):
+ if self.force_pages and slide_idx == num_slides:
+ break
+ try:
+ slide, code_executor = self.generate_slide(slide_idx, outline_item)
+ generated_slides.append(slide)
+ code_executors.append(code_executor)
+ except Exception as e:
+ logger.warning(
+ "Failed to generate slide, error_exit=%s, error: %s",
+ self.error_exit,
+ str(e),
+ )
+ traceback.print_exc()
+ if self.error_exit:
+ succ_flag = False
+ break
+
+ # Collect history data
+ history = self._collect_history(
+ sum(code_executors, start=CodeExecutor(self.retry_times))
+ )
+
+ if succ_flag:
+ self.empty_prs.slides = generated_slides
+ prs = self.empty_prs
+ else:
+ prs = None
+
+ self.empty_prs = deepcopy(self.presentation)
+ return prs, history
+
+ def generate_outline(
+ self,
+ num_slides: int,
+ source_doc: Document,
+ ):
+ """
+ Generate an outline for the presentation.
+
+ Args:
+ num_slides (int): The number of slides to generate.
+ source_doc (Document): The source document.
+
+ Returns:
+ list[OutlineItem]: The generated outline.
+ """
+ assert self._initialized, "PPTGen not initialized, call `set_reference` first"
+ turn_id, outline = self.staffs["planner"](
+ num_slides=num_slides,
+ document_overview=source_doc.get_overview(),
+ )
+ if num_slides == 1 and isinstance(outline, dict):
+ outline = [outline]
+ outline = self._fix_outline(outline, source_doc, turn_id)
+ return self._add_functional_layouts(outline)
+
+ @abstractmethod
+ def generate_slide(
+ self, slide_idx: int, outline_item: OutlineItem
+ ) -> tuple[SlidePage, CodeExecutor]:
+ """
+ Generate a slide from the outline item.
+ """
+ raise NotImplementedError("Subclass must implement this method")
+
+ def _add_functional_layouts(self, outline: list[OutlineItem]):
+ """
+ Add functional layouts to the outline.
+ """
+ toc = []
+ for item in outline:
+ if item.section not in toc and item.section != "Functional":
+ toc.append(item.section)
+ self.toc = "\n".join(toc)
+
+ fixed_functional_slides = [
+ (FunctionalLayouts.TOC.value, 0), # toc should be inserted before opening
+ (FunctionalLayouts.OPENING.value, 0),
+ (FunctionalLayouts.ENDING.value, 999999), # append to the end
+ ]
+ for title, pos in fixed_functional_slides:
+ layout = max(
+ self.functional_layouts,
+ key=lambda x: edit_distance(x.lower(), title),
+ )
+ if edit_distance(layout, title) > 0.7:
+ outline.insert(pos, OutlineItem(title, "Functional", {}, []))
+
+ section_outline = max(
+ self.functional_layouts,
+ key=lambda x: edit_distance(x, FunctionalLayouts.SECTION_OUTLINE.value),
+ )
+ if edit_distance(section_outline, FunctionalLayouts.SECTION_OUTLINE.value) <= 0.7:
+ return outline
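+ # Insert a section-outline slide before the first content slide of every
+ # new section, leaving functional slides untouched.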
+ full_outline = []
+ pre_section = None
+ for item in outline:
+ if item.section == "Functional":
+ full_outline.append(item)
+ continue
+ if item.section != pre_section:
+ new_item = OutlineItem(
+ FunctionalLayouts.SECTION_OUTLINE.value,
+ "Functional",
+ item.section,
+ [],
+ )
+ full_outline.append(new_item)
+ full_outline.append(item)
+ pre_section = item.section
+ return full_outline
+
+ def _hide_small_pics(self, area_ratio: float, keep_in_background: bool):
+ for layout in self.layouts.values():
+ template_slide = self.presentation.slides[layout.template_id - 1]
+ pictures = list(template_slide.shape_filter(Picture, return_father=True))
+ if len(pictures) == 0:
+ continue
+ for father, pic in pictures:
+ if pic.area / pic.slide_area < area_ratio:
+ father.shapes.remove(pic)
+ if keep_in_background:
+ # keep the small picture as a background decoration instead of dropping it
+ father.backgrounds.append(pic)
+ layout.remove_item(pic.caption.strip())
+
+ if len(list(template_slide.shape_filter(Picture))) == 0:
+ logger.debug(
+ "All pictures in layout %s are too small, set to pure text layout",
+ layout.title,
+ )
+ layout.title = layout.title.replace(":image", ":text")
+
+ def _fix_outline(
+ self, outline: list[dict], source_doc: Document, turn_id: int, retry: int = 0
+ ) -> list[OutlineItem]:
+ """
+ Validate the generated outline.
+
+ Raises:
+ ValueError: If the outline is invalid.
+ """
+ try:
+ outline_items = [
+ OutlineItem.from_dict(outline_item) for outline_item in outline
+ ]
+ for outline_item in outline_items:
+ outline_item.check_retrieve(source_doc, self.sim_bound)
+ outline_item.check_images(
+ source_doc, self.text_embedder, self.sim_bound
+ )
+ return outline_items
+ except Exception as e:
+ retry += 1
+ logger.info(
+ "Failed to generate outline, tried %d/%d times, error: %s",
+ retry,
+ self.retry_times,
+ str(e),
+ )
+ logger.debug(traceback.format_exc())
+ if retry < self.retry_times:
+ new_outline = self.staffs["planner"].retry(
+ str(e), traceback.format_exc(), turn_id, retry
+ )
+ return self._fix_outline(new_outline, source_doc, turn_id, retry)
+ else:
+ raise ValueError("Failed to generate outline, tried too many times")
+
+ def _collect_history(self, code_executor: CodeExecutor):
+ """
+ Collect the history of code execution, API calls and agent steps.
+
+ Returns:
+ dict: The collected history data.
+ """
+ history = {
+ "agents": {},
+ "code_history": code_executor.code_history,
+ "api_history": code_executor.api_history,
+ }
+
+ for role_name, role in self.staffs.items():
+ history["agents"][role_name] = role.history
+ role._history = []
+
+ return history
+
+ def _hire_staffs(
+ self,
+ record_cost: bool,
+ language_model: LLM | AsyncLLM,
+ vision_model: LLM | AsyncLLM,
+ ) -> dict[str, Agent]:
+ """
+ Initialize agent roles and their models
+ """
+ llm_mapping = {
+ "language": language_model,
+ "vision": vision_model,
+ }
+ self.staffs = {
+ role: Agent(
+ role,
+ record_cost=record_cost,
+ text_model=self.text_embedder,
+ llm_mapping=llm_mapping,
+ )
+ for role in ["planner"] + self.roles
+ }
+
+
+@dataclass
+class PPTGenAsync(PPTGen):
+ """
+ Asynchronous base class for generating PowerPoint presentations.
+ Extends PPTGen with async functionality.
+ """
+
+ def __post_init__(self):
+ super().__post_init__()
+ for k in list(self.staffs.keys()):
+ self.staffs[k] = self.staffs[k].to_async()
+
+ async def generate_pres(
+ self,
+ source_doc: Document,
+ num_slides: Optional[int] = None,
+ outline: Optional[list[OutlineItem]] = None,
+ ):
+ """
+ Asynchronously generate a PowerPoint presentation.
+ """
+ assert (
+ self._initialized
+ ), "AsyncPPTAgent not initialized, call `set_reference` first"
+ self.source_doc = source_doc
+ succ_flag = True
+ if outline is None:
+ self.outline = await self.generate_outline(num_slides, source_doc)
+ else:
+ self.outline = outline
+ self.simple_outline = "\n".join(
+ [
+ f"Slide {slide_idx+1}: {item.purpose}"
+ for slide_idx, item in enumerate(self.outline)
+ ]
+ )
+
+ slide_tasks = []
+ for slide_idx, outline_item in enumerate(self.outline):
+ if self.force_pages and slide_idx == num_slides:
+ break
+ slide_tasks.append(self.generate_slide(slide_idx, outline_item))
+
+ slide_results = await asyncio.gather(*slide_tasks, return_exceptions=True)
+
+ generated_slides = []
+ code_executors = []
+ for result in slide_results:
+ if isinstance(result, Exception):
+ if self.error_exit:
+ succ_flag = False
+ break
+ continue
+ if result is not None:
+ slide, code_executor = result
+ generated_slides.append(slide)
+ code_executors.append(code_executor)
+
+ history = self._collect_history(
+ sum(code_executors, start=CodeExecutor(self.retry_times))
+ )
+
+ if succ_flag:
+ self.empty_prs.slides = generated_slides
+ prs = self.empty_prs
+ else:
+ prs = None
+
+ self.empty_prs = deepcopy(self.presentation)
+ return prs, history
+
+ async def generate_outline(
+ self,
+ num_slides: int,
+ source_doc: Document,
+ ):
+ """
+ Asynchronously generate an outline for the presentation.
+ """
+ assert (
+ self._initialized
+ ), "AsyncPPTAgent not initialized, call `set_reference` first"
+
+ turn_id, outline = await self.staffs["planner"](
+ num_slides=num_slides,
+ document_overview=source_doc.get_overview(),
+ )
+ if num_slides == 1 and isinstance(outline, dict):
+ outline = [outline]
+ outline = await self._fix_outline(outline, source_doc, turn_id)
+ return self._add_functional_layouts(outline)
+
+ @abstractmethod
+ async def generate_slide(
+ self, slide_idx: int, outline_item: OutlineItem
+ ) -> tuple[SlidePage, CodeExecutor]:
+ """
+ Asynchronously generate a slide from the outline item.
+ """
+ raise NotImplementedError("Subclass must implement this method")
+
+ async def _fix_outline(
+ self, outline: list[dict], source_doc: Document, turn_id: int, retry: int = 0
+ ) -> list[OutlineItem]:
+ """
+ Asynchronously validate the generated outline.
+ """
+ try:
+ outline_items = [
+ OutlineItem.from_dict(outline_item) for outline_item in outline
+ ]
+ async with asyncio.TaskGroup() as tg:
+ for outline_item in outline_items:
+ outline_item.check_retrieve(source_doc, self.sim_bound)
+ tg.create_task(
+ outline_item.check_images_async(
+ source_doc, self.text_embedder, self.sim_bound
+ )
+ )
+ return outline_items
+ except Exception as e:
+ retry += 1
+ logger.info(
+ "Failed to generate outline, tried %d/%d times, error: %s",
+ retry,
+ self.retry_times,
+ str(e),
+ )
+ logger.debug(traceback.format_exc())
+ if retry < self.retry_times:
+ new_outline = await self.staffs["planner"].retry(
+ str(e), traceback.format_exc(), turn_id, retry
+ )
+ return await self._fix_outline(new_outline, source_doc, turn_id, retry)
+ else:
+ raise ValueError("Failed to generate outline, tried too many times")
+
+
+class PPTAgent(PPTGen):
+ """
+ A class to generate PowerPoint presentations with a crew of agents.
+ """
+
+ roles: list[str] = [
+ "editor",
+ "coder",
+ "content_organizer",
+ "layout_selector",
+ "notes_generator",
+ ]
+
+ def generate_slide(
+ self, slide_idx: int, outline_item: OutlineItem
+ ) -> tuple[SlidePage, CodeExecutor]:
+ """
+ Generate a slide from the outline item.
+ """
+ if outline_item.section == "Functional":
+ layout = self.layouts[
+ max(
+ self.functional_layouts,
+ key=lambda x: edit_distance(x, outline_item.purpose),
+ )
+ ]
+ slide_desc = FunctionalContent[outline_item.purpose]
+ if outline_item.purpose == FunctionalLayouts.SECTION_OUTLINE.value:
+ outline_item.purpose = f"Section Outline of {outline_item.indexs}"
+ outline_item.indexs = {}
+ slide_content = (
+ "Overview of the Document:\n"
+ + self.source_doc.get_overview(include_summary=True)
+ )
+ elif outline_item.purpose == FunctionalLayouts.TOC.value:
+ slide_content = "Table of Contents:\n" + self.toc
+ else:
+ slide_content = "This slide is a functional layout, please follow the slide description and content schema to generate the slide content."
+ header, _, _ = outline_item.retrieve(slide_idx, self.source_doc)
+ header += slide_desc
+ else:
+ layout, header, slide_content = self._select_layout(slide_idx, outline_item)
+ command_list, template_id = self._generate_content(
+ layout, slide_content, header
+ )
+ notes = self._generate_notes(slide_content, header)
+ slide, code_executor = self._edit_slide(command_list, template_id, notes)
+ slide.slide_notes = notes
+ return slide, code_executor
+
+ @tenacity_decorator
+ def _select_layout(
+ self, slide_idx: int, outline_item: OutlineItem
+ ) -> tuple[Layout, str, str]:
+ """
+ Select a layout for the slide.
+ """
+ header, content_source, images = outline_item.retrieve(
+ slide_idx, self.source_doc
+ )
+ if len(content_source) == 0:
+ key_points = []
+ else:
+ _, key_points = self.staffs["content_organizer"](
+ content_source=content_source
+ )
+ slide_content = json.dumps(key_points, indent=2, ensure_ascii=False)
+ layouts = self.text_layouts
+ if len(images) > 0:
+ slide_content += "\nImages:\n" + "\n".join(images)
+ layouts = self.multimodal_layouts
+
+ _, layout_selection = self.staffs["layout_selector"](
+ outline=self.simple_outline,
+ slide_description=header,
+ slide_content=slide_content,
+ available_layouts=layouts,
+ )
+ layout = max(
+ self.layouts.keys(),
+ key=lambda x: edit_distance(x, layout_selection["layout"]),
+ )
+ if "image" in layout and len(images) == 0:
+ logger.debug(
+ f"An image layout: {layout} is selected, but no images are provided, please check the parsed document and outline item:\n {outline_item}"
+ )
+ elif "image" not in layout and len(images) > 0:
+ logger.debug(
+ f"A pure text layout: {layout} is selected, but images are provided, please check the parsed document and outline item:\n {outline_item}\n Set images to empty list."
+ )
+ slide_content = slide_content[: slide_content.rfind("\nImages:\n")]
+ return self.layouts[layout], header, slide_content
+
+ def _generate_content(
+ self,
+ layout: Layout,
+ slide_content: str,
+ slide_description: str,
+ ) -> tuple[list, int]:
+ """
+ Synergize Agents to generate a slide.
+
+ Args:
+ layout (Layout): The layout data.
+ slide_content (str): The slide content.
+ slide_description (str): The description of the slide.
+
+ Returns:
+ tuple[list, int]: The generated command list and template id.
+ """
+ turn_id, editor_output = self.staffs["editor"](
+ outline=self.simple_outline,
+ metadata=self.source_doc.metainfo,
+ slide_description=slide_description,
+ slide_content=slide_content,
+ schema=layout.content_schema,
+ )
+ command_list, template_id = self._generate_commands(
+ editor_output, layout, turn_id
+ )
+ return command_list, template_id
+
+ def _generate_notes(
+ self,
+ slide_content: str,
+ slide_description: str,
+ ) -> str:
+ """
+ Generate speaker notes for a slide.
+ """
+ _, notes = self.staffs["notes_generator"](
+ slide_content=slide_content,
+ slide_description=slide_description,
+ )
+ return notes
+
+ def _edit_slide(
+ self, command_list: list, template_id: int, notes: str
+ ) -> tuple[SlidePage, CodeExecutor]:
+ code_executor = CodeExecutor(self.retry_times)
+ turn_id, edit_actions = self.staffs["coder"](
+ api_docs=code_executor.get_apis_docs(API_TYPES.Agent.value),
+ edit_target=self.presentation.slides[template_id - 1].to_html(),
+ command_list="\n".join([str(i) for i in command_list]),
+ )
+ for error_idx in range(self.retry_times):
+ edit_slide: SlidePage = deepcopy(self.presentation.slides[template_id - 1])
+ feedback = code_executor.execute_actions(
+ edit_actions, edit_slide, self.source_doc
+ )
+ if feedback is None:
+ break
+ logger.info(
+ "Failed to generate slide, tried %d/%d times, error: %s",
+ error_idx + 1,
+ self.retry_times,
+ str(feedback[1]),
+ )
+ logger.debug(traceback.format_exc())
+ if error_idx == self.retry_times - 1:
+ raise Exception(
+ f"Failed to generate slide, tried too many times at editing\ntraceback: {feedback[1]}"
+ )
+ edit_actions = self.staffs["coder"].retry(
+ feedback[0], feedback[1], turn_id, error_idx + 1
+ )
+ self.empty_prs.build_slide(edit_slide)
+ return edit_slide, code_executor
+
+ def _generate_commands(
+ self, editor_output: dict, layout: Layout, turn_id: int, retry: int = 0
+ ):
+ """
+ Generate commands for editing the slide content.
+ """
+ command_list = []
+ try:
+ layout.validate(editor_output, self.source_doc.image_dir)
+ if self.length_factor is not None:
+ layout.validate_length(
+ editor_output, self.length_factor, self.language_model
+ )
+ old_data = layout.get_old_data(editor_output)
+ template_id = layout.get_slide_id(editor_output)
+ except Exception as e:
+ if retry < self.retry_times:
+ new_output = self.staffs["editor"].retry(
+ e,
+ traceback.format_exc(),
+ turn_id,
+ retry + 1,
+ )
+ return self._generate_commands(new_output, layout, turn_id, retry + 1)
+ else:
+ raise Exception(
+ f"Failed to generate commands, tried too many times at editing\ntraceback: {e}"
+ )
+
+ for el_name, old_content in old_data.items():
+ if not isinstance(old_content, list):
+ old_content = [old_content]
+
+ new_content = editor_output.get(el_name, {"data": []})["data"]
+ if not isinstance(new_content, list):
+ new_content = [new_content]
+ new_content = [i for i in new_content if i]
+ quantity_change = len(new_content) - len(old_content)
+ command_list.append(
+ (
+ el_name,
+ layout[el_name].el_type,
+ f"quantity_change: {quantity_change}",
+ old_content,
+ new_content,
+ )
+ )
+
+ assert len(command_list) > 0, "No commands generated"
+ return command_list, template_id
+
+
+class PPTAgentAsync(PPTGenAsync):
+ """
+ Asynchronous version of PPTAgent that uses AsyncAgent for concurrent processing.
+ """
+
+ roles: list[str] = [
+ "editor",
+ "coder",
+ "content_organizer",
+ "layout_selector",
+ "notes_generator",
+ ]
+
+ async def generate_slide(
+ self, slide_idx: int, outline_item: OutlineItem
+ ) -> tuple[SlidePage, CodeExecutor]:
+ """
+ Asynchronously generate a slide from the outline item.
+ """
+ if outline_item.section == "Functional":
+ layout = self.layouts[
+ max(
+ self.functional_layouts,
+ key=lambda x: edit_distance(x.lower(), outline_item.purpose),
+ )
+ ]
+ slide_desc = FunctionalContent[outline_item.purpose]
+ if outline_item.purpose == FunctionalLayouts.SECTION_OUTLINE.value:
+ outline_item.purpose = f"Section Outline of {outline_item.indexs}"
+ outline_item.indexs = {}
+ slide_content = (
+ "Overview of the Document:\n"
+ + self.source_doc.get_overview(include_summary=True)
+ )
+ elif outline_item.purpose == FunctionalLayouts.TOC.value:
+ slide_content = "Table of Contents:\n" + self.toc
+ else:
+ slide_content = "This slide is a functional layout, please follow the slide description and content schema to generate the slide content."
+ header, _, _ = outline_item.retrieve(slide_idx, self.source_doc)
+ header += slide_desc
+ else:
+ layout, header, slide_content = await self._select_layout(
+ slide_idx, outline_item
+ )
+ try:
+ command_list, template_id = await self._generate_content(
+ layout, slide_content, header
+ )
+ notes = await self._generate_notes(slide_content, header)
+ slide, code_executor = await self._edit_slide(command_list, template_id, notes)
+ slide.slide_notes = notes
+ except Exception as e:
+ logger.error(f"Failed to generate slide {slide_idx}, error: {e}")
+ traceback.print_exc()
+ raise e
+ return slide, code_executor
+
+ @tenacity_decorator
+ async def _select_layout(
+ self, slide_idx: int, outline_item: OutlineItem
+ ) -> tuple[Layout, str, str]:
+ """
+ Asynchronously select a layout for the slide.
+ """
+ header, content_source, images = outline_item.retrieve(
+ slide_idx, self.source_doc
+ )
+ if len(content_source) == 0:
+ key_points = []
+ else:
+ _, key_points = await self.staffs["content_organizer"](
+ content_source=content_source
+ )
+ slide_content = json.dumps(key_points, indent=2, ensure_ascii=False)
+ layouts = self.text_layouts
+ if len(images) > 0:
+ slide_content += "\nImages:\n" + "\n".join(images)
+ layouts = self.multimodal_layouts
+
+ _, layout_selection = await self.staffs["layout_selector"](
+ outline=self.simple_outline,
+ slide_description=header,
+ slide_content=slide_content,
+ available_layouts=layouts,
+ )
+ layout = max(
+ self.layouts.keys(),
+ key=lambda x: edit_distance(x, layout_selection["layout"]),
+ )
+ if "image" in layout and len(images) == 0:
+ logger.debug(
+ f"An image layout: {layout} is selected, but no images are provided, please check the parsed document and outline item:\n {outline_item}"
+ )
+ elif "image" not in layout and len(images) > 0:
+ logger.debug(
+ f"A pure text layout: {layout} is selected, but images are provided, please check the parsed document and outline item:\n {outline_item}\n Set images to empty list."
+ )
+ slide_content = slide_content[: slide_content.rfind("\nImages:\n")]
+ return self.layouts[layout], header, slide_content
+
+ async def _generate_content(
+ self,
+ layout: Layout,
+ slide_content: str,
+ slide_description: str,
+ ) -> tuple[list, int]:
+ """
+ Synergize Agents to generate a slide.
+
+ Args:
+ layout (Layout): The layout data.
+ slide_content (str): The slide content.
+ slide_description (str): The description of the slide.
+
+ Returns:
+ tuple[list, int]: The generated command list and template id.
+ """
+ turn_id, editor_output = await self.staffs["editor"](
+ outline=self.simple_outline,
+ metadata=self.source_doc.metainfo,
+ slide_description=slide_description,
+ slide_content=slide_content,
+ schema=layout.content_schema,
+ )
+ command_list, template_id = await self._generate_commands(
+ editor_output, layout, turn_id
+ )
+ return command_list, template_id
+
+ async def _generate_notes(
+ self,
+ slide_content: str,
+ slide_description: str,
+ ) -> str:
+ """
+ Generate speaker notes for a slide.
+ """
+ _, notes = await self.staffs["notes_generator"](
+ slide_content=slide_content,
+ slide_description=slide_description,
+ )
+ return notes
+
+ async def _edit_slide(
+ self, command_list: list, template_id: int, notes: str
+ ) -> tuple[SlidePage, CodeExecutor]:
+ """
+ Asynchronously edit the slide.
+ """
+ code_executor = CodeExecutor(self.retry_times)
+ turn_id, edit_actions = await self.staffs["coder"](
+ api_docs=code_executor.get_apis_docs(API_TYPES.Agent.value),
+ edit_target=self.presentation.slides[template_id - 1].to_html(),
+ command_list="\n".join([str(i) for i in command_list]),
+ )
+
+ for error_idx in range(self.retry_times):
+ edit_slide: SlidePage = deepcopy(self.presentation.slides[template_id - 1])
+ feedback = code_executor.execute_actions(
+ edit_actions, edit_slide, self.source_doc
+ )
+ if feedback is None:
+ break
+ logger.info(
+ "Failed to generate slide, tried %d/%d times, error: %s",
+ error_idx + 1,
+ self.retry_times,
+ str(feedback[1]),
+ )
+ if error_idx == self.retry_times - 1:
+ raise Exception(
+ f"Failed to generate slide, tried too many times at editing\ntraceback: {feedback[1]}"
+ )
+ edit_actions = await self.staffs["coder"].retry(
+ feedback[0], feedback[1], turn_id, error_idx + 1
+ )
+ self.empty_prs.build_slide(edit_slide)
+ return edit_slide, code_executor
+
+ async def _generate_commands(
+ self, editor_output: dict, layout: Layout, turn_id: int, retry: int = 0
+ ):
+ """
+ Asynchronously generate commands for editing the slide content.
+
+ Args:
+ editor_output (dict): The editor output.
+ layout (Layout): The layout object containing content schema.
+ turn_id (int): The turn ID for retrying.
+ retry (int, optional): The number of retries. Defaults to 0.
+
+ Returns:
+ tuple[list, int]: The generated command list and template id.
+
+ Raises:
+ Exception: If command generation fails.
+ """
+ command_list = []
+ try:
+ layout.validate(editor_output, self.source_doc.image_dir)
+ if self.length_factor is not None:
+ await layout.validate_length_async(
+ editor_output, self.length_factor, self.language_model
+ )
+ old_data = layout.get_old_data(editor_output)
+ template_id = layout.get_slide_id(editor_output)
+ except Exception as e:
+ if retry < self.retry_times:
+ new_output = await self.staffs["editor"].retry(
+ e,
+ traceback.format_exc(),
+ turn_id,
+ retry + 1,
+ )
+ return await self._generate_commands(
+ new_output, layout, turn_id, retry + 1
+ )
+ else:
+ raise Exception(
+ f"Failed to generate commands, tried too many times at editing\ntraceback: {e}"
+ )
+
+ for el_name, old_content in old_data.items():
+ if not isinstance(old_content, list):
+ old_content = [old_content]
+
+ new_content = editor_output.get(el_name, {"data": []})["data"]
+ if not isinstance(new_content, list):
+ new_content = [new_content]
+ new_content = [i for i in new_content if i]
+ quantity_change = len(new_content) - len(old_content)
+ command_list.append(
+ (
+ el_name,
+ layout[el_name].el_type,
+ f"quantity_change: {quantity_change}",
+ old_content,
+ new_content,
+ )
+ )
+
+ assert len(command_list) > 0, "No commands generated"
+ return command_list, template_id
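+
+
+# A minimal end-to-end sketch (illustrative names; assumes slide induction and a
+# parsed source Document are already available):
+#   agent = PPTAgent(
+#       text_embedder=text_embedder,
+#       language_model=language_model,
+#       vision_model=vision_model,
+#   )
+#   agent.set_reference(config, slide_induction, presentation)
+#   prs, history = agent.generate_pres(source_doc, num_slides=10)
+#   prs.save("output.pptx")  # assuming the Presentation object exposes save()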
diff --git a/pptagent/presentation/__init__.py b/pptagent/presentation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2f9742fba1ada4051c0b5826e5adc87571f9ff4
--- /dev/null
+++ b/pptagent/presentation/__init__.py
@@ -0,0 +1,46 @@
+from .layout import Layout
+from .presentation import Presentation, SlidePage
+from .shapes import (
+ SHAPECAST,
+ Background,
+ Closure,
+ ClosureType,
+ Fill,
+ Font,
+ FreeShape,
+ GroupShape,
+ Line,
+ Paragraph,
+ Picture,
+ Placeholder,
+ SemanticPicture,
+ ShapeElement,
+ StyleArg,
+ TextBox,
+ TextFrame,
+ UnsupportedShape,
+)
+
+__all__ = [
+ "Presentation",
+ "SlidePage",
+ "SHAPECAST",
+ "Background",
+ "Closure",
+ "ClosureType",
+ "Fill",
+ "Font",
+ "FreeShape",
+ "GroupShape",
+ "Layout",
+ "Line",
+ "Paragraph",
+ "Picture",
+ "Placeholder",
+ "SemanticPicture",
+ "ShapeElement",
+ "StyleArg",
+ "TextBox",
+ "TextFrame",
+ "UnsupportedShape",
+]
diff --git a/pptagent/presentation/__pycache__/__init__.cpython-312.pyc b/pptagent/presentation/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3dfecb3d67b46a83d8f1bb102cd556d4193fab9d
Binary files /dev/null and b/pptagent/presentation/__pycache__/__init__.cpython-312.pyc differ
diff --git a/pptagent/presentation/__pycache__/layout.cpython-312.pyc b/pptagent/presentation/__pycache__/layout.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..659730edc25271c9edabde6fdb92da7d64be86e2
Binary files /dev/null and b/pptagent/presentation/__pycache__/layout.cpython-312.pyc differ
diff --git a/pptagent/presentation/__pycache__/presentation.cpython-312.pyc b/pptagent/presentation/__pycache__/presentation.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1d0a55d6045a3b3fc78350f504ea16ee48193f72
Binary files /dev/null and b/pptagent/presentation/__pycache__/presentation.cpython-312.pyc differ
diff --git a/pptagent/presentation/__pycache__/shapes.cpython-312.pyc b/pptagent/presentation/__pycache__/shapes.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..735acd7eee42d92d8eb4b736cf7e2441fd7837d3
Binary files /dev/null and b/pptagent/presentation/__pycache__/shapes.cpython-312.pyc differ
diff --git a/pptagent/presentation/layout.py b/pptagent/presentation/layout.py
new file mode 100644
index 0000000000000000000000000000000000000000..f944fc2df2318c45d7740e6a12dfd957d6334f1b
--- /dev/null
+++ b/pptagent/presentation/layout.py
@@ -0,0 +1,234 @@
+import asyncio
+from dataclasses import dataclass
+from typing import Literal, Optional
+
+from jinja2 import StrictUndefined, Template
+
+from pptagent.llms import LLM, AsyncLLM
+from pptagent.utils import get_logger, package_join, pbasename, pexists, pjoin
+
+logger = get_logger(__name__)
+
+LENGTHY_REWRITE_PROMPT = Template(
+ open(package_join("prompts", "lengthy_rewrite.txt")).read(),
+ undefined=StrictUndefined,
+)
+
+
+@dataclass
+class Element:
+ el_name: str
+ content: list[str]
+ description: str
+ el_type: Literal["text", "image"]
+ suggested_characters: int | None
+ variable_length: tuple[int, int] | None
+ variable_data: dict[str, list[str]] | None
+
+ def get_schema(self):
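+ """Render this element as a human-readable schema block for prompting."""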
+ schema = f"Element: {self.el_name}\n"
+ base_attrs = ["description", "el_type"]
+ for attr in base_attrs:
+ schema += f"\t{attr}: {getattr(self, attr)}\n"
+ if self.el_type == "text":
+ schema += f"\tsuggested_characters: {self.suggested_characters}\n"
+ if self.variable_length is not None:
+ schema += f"\tThe length of the element can vary between {self.variable_length[0]} and {self.variable_length[1]}\n"
+ schema += f"\tThe default quantity of the element is {len(self.content)}\n"
+ return schema
+
+ @classmethod
+ def from_dict(cls, el_name: str, data: dict):
+ if not isinstance(data["data"], list):
+ data["data"] = [data["data"]]
+ if data["type"] == "text":
+ suggested_characters = max(len(i) for i in data["data"])
+ elif data["type"] == "image":
+ suggested_characters = None
+ return cls(
+ el_name=el_name,
+ el_type=data["type"],
+ content=data["data"],
+ description=data["description"],
+ variable_length=data.get("variableLength", None),
+ variable_data=data.get("variableData", None),
+ suggested_characters=suggested_characters,
+ )
+
+
+@dataclass
+class Layout:
+ title: str
+ template_id: int
+ slides: list[int]
+ elements: list[Element]
+ vary_mapping: dict[int, int] | None # mapping for variable elements
+
+ @classmethod
+ def from_dict(cls, title: str, data: dict):
+ elements = [
+ Element.from_dict(el_name, data["content_schema"][el_name])
+ for el_name in data["content_schema"]
+ ]
+ num_vary_elements = sum((el.variable_length is not None) for el in elements)
+ if num_vary_elements > 1:
+ raise ValueError("Only one variable element is allowed")
+ return cls(
+ title=title,
+ template_id=data["template_id"],
+ slides=data["slides"],
+ elements=elements,
+ vary_mapping=data.get("vary_mapping", None),
+ )
+
+ def get_slide_id(self, data: dict):
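+ """
+ Resolve which template slide to instantiate: a variable-length element maps
+ the number of provided items to a slide via `vary_mapping`; otherwise the
+ layout's default `template_id` is used.
+ """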
+ for el in self.elements:
+ if el.variable_length is not None:
+ num_vary = len(data[el.el_name]["data"])
+ if num_vary < el.variable_length[0]:
+ raise ValueError(
+ f"The length of {el.el_name}: {num_vary} is less than the minimum length: {el.variable_length[0]}"
+ )
+ if num_vary > el.variable_length[1]:
+ raise ValueError(
+ f"The length of {el.el_name}: {num_vary} is greater than the maximum length: {el.variable_length[1]}"
+ )
+ return self.vary_mapping[str(num_vary)]
+ return self.template_id
+
+ def get_old_data(self, editor_output: Optional[dict] = None):
+ if editor_output is None:
+ return {el.el_name: el.content for el in self.elements}
+ old_data = {}
+ for el in self.elements:
+ if el.variable_length is not None:
+ key = str(len(editor_output[el.el_name]["data"]))
+ assert (
+ key in el.variable_data
+ ), f"The length of element {el.el_name} varies between {el.variable_length[0]} and {el.variable_length[1]}, but got data of length {key} which is not supported"
+ old_data[el.el_name] = el.variable_data[key]
+ else:
+ old_data[el.el_name] = el.content
+ return old_data
+
+ def validate(self, editor_output: dict, image_dir: str):
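+ """
+ Check that the editor output conforms to this layout: every element must
+ carry a `data` list, be defined in the schema, and image paths must resolve
+ to existing files (directly or relative to image_dir).
+ """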
+ for el_name, el_data in editor_output.items():
+ assert (
+ "data" in el_data
+ ), """key `data` not found in output
+ please give your output as a dict like
+ {
+ "element1": {
+ "data": ["text1", "text2"] for text elements
+ or ["/path/to/image", "..."] for image elements
+ },
+ }"""
+ assert (
+ el_name in self
+ ), f"Element {el_name} is not a valid element, supported elements are {[el.el_name for el in self.elements]}"
+ if self[el_name].el_type == "image":
+ for i in range(len(el_data["data"])):
+ if pexists(pjoin(image_dir, el_data["data"][i])):
+ el_data["data"][i] = pjoin(image_dir, el_data["data"][i])
+ if not pexists(el_data["data"][i]):
+ basename = pbasename(el_data["data"][i])
+ if pexists(pjoin(image_dir, basename)):
+ el_data["data"][i] = pjoin(image_dir, basename)
+ else:
+ raise ValueError(
+ f"Image {el_data['data'][i]} not found\n"
+ "Please check the image path and use only existing images\n"
+ "Or, leave a blank list for this element"
+ )
+
+ def validate_length(
+ self, editor_output: dict, length_factor: float, language_model: LLM
+ ):
+ for el_name, el_data in editor_output.items():
+ if self[el_name].el_type == "text":
+ character_counts = [len(i) for i in el_data["data"]]
+ if (
+ max(character_counts)
+ > self[el_name].suggested_characters * length_factor
+ ):
+ el_data["data"] = language_model(
+ LENGTHY_REWRITE_PROMPT.render(
+ el_name=el_name,
+ content=el_data["data"],
+ suggested_characters=f"{self[el_name].suggested_characters} characters",
+ ),
+ return_json=True,
+ )
+ assert isinstance(
+ el_data["data"], list
+ ), f"Lengthy rewrite of element {el_name} must return a list of strings, but got {type(el_data['data'])}"
+
+ async def validate_length_async(
+ self, editor_output: dict, length_factor: float, language_model: AsyncLLM
+ ):
+ async with asyncio.TaskGroup() as tg:
+ tasks = {}
+ for el_name, el_data in editor_output.items():
+ if self[el_name].el_type == "text":
+ character_counts = [len(i) for i in el_data["data"]]
+ if (
+ max(character_counts)
+ > self[el_name].suggested_characters * length_factor
+ ):
+ task = tg.create_task(
+ language_model(
+ LENGTHY_REWRITE_PROMPT.render(
+ el_name=el_name,
+ content=el_data["data"],
+ suggested_characters=f"{self[el_name].suggested_characters} characters",
+ ),
+ return_json=True,
+ )
+ )
+ tasks[el_name] = task
+
+ for el_name, task in tasks.items():
+ new_data = await task
+ assert isinstance(
+ new_data, list
+ ), f"Lengthy rewrite of element {el_name} must return a list of strings, but got {type(new_data)}"
+ logger.debug(
+ f"Lengthy rewrite for {el_name}:\n From {editor_output[el_name]['data']}\n To {new_data}"
+ )
+ editor_output[el_name]["data"] = new_data
+
+ @property
+ def content_schema(self):
+ return "\n".join([el.get_schema() for el in self.elements])
+
+ def remove_item(self, item: str):
+ for el in self.elements:
+ if item in el.content:
+ el.content.remove(item)
+ if len(el.content) == 0:
+ self.elements.remove(el)
+ return
+ else:
+ raise ValueError(f"Item {item} not found in layout {self.title}")
+
+ def __contains__(self, key: str | int):
+ if isinstance(key, int):
+ return key in self.slides
+ elif isinstance(key, str):
+ for el in self.elements:
+ if el.el_name == key:
+ return True
+ return False
+ raise ValueError(f"Invalid key type: {type(key)}, should be str or int")
+
+ def __getitem__(self, key: str):
+ for el in self.elements:
+ if el.el_name == key:
+ return el
+ raise ValueError(f"Element {key} not found")
+
+ def __iter__(self):
+ return iter(self.elements)
+
+ def __len__(self):
+ return len(self.elements)
diff --git a/pptagent/presentation/presentation.py b/pptagent/presentation/presentation.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1cd825101e2663e6b758e4e02710a12e98e5f0c
--- /dev/null
+++ b/pptagent/presentation/presentation.py
@@ -0,0 +1,481 @@
+import traceback
+from collections.abc import Generator
+from typing import Literal, Optional
+
+from pptx import Presentation as load_prs
+from pptx.enum.shapes import MSO_SHAPE_TYPE
+from pptx.shapes.base import BaseShape
+from pptx.shapes.group import GroupShape as PPTXGroupShape
+from pptx.slide import Slide as PPTXSlide
+
+from pptagent.utils import Config, get_logger, package_join
+
+from .shapes import (
+ Background,
+ GroupShape,
+ Paragraph,
+ Picture,
+ ShapeElement,
+ StyleArg,
+)
+
+# Type variable for ShapeElement subclasses
+
+logger = get_logger(__name__)
+
+
+class SlidePage:
+ """
+ A class to represent a slide page in a presentation.
+ """
+
+ def __init__(
+ self,
+ shapes: list[ShapeElement],
+ backgrounds: list[Background],
+ slide_idx: int,
+ real_idx: int,
+ slide_notes: Optional[str],
+ slide_layout_name: Optional[str],
+ slide_title: Optional[str],
+ slide_width: int,
+ slide_height: int,
+ ):
+ """
+ Initialize a SlidePage.
+
+ Args:
+ shapes (List[ShapeElement]): The shapes in the slide.
+ backgrounds (List[Background]): The backgrounds of the slide.
+ slide_idx (int): The index of the slide.
+ real_idx (int): The real index of the slide.
+ slide_notes (Optional[str]): The notes of the slide.
+ slide_layout_name (Optional[str]): The layout name of the slide.
+ slide_title (Optional[str]): The title of the slide.
+ slide_width (int): The width of the slide.
+ slide_height (int): The height of the slide.
+ """
+ self.shapes = shapes
+ self.backgrounds = backgrounds
+ self.slide_idx = slide_idx
+ self.real_idx = real_idx
+ self.slide_notes = slide_notes
+ self.slide_layout_name = slide_layout_name
+ self.slide_title = slide_title
+ self.slide_width = slide_width
+ self.slide_height = slide_height
+
+ # Assign group labels to group shapes: reuse the label of an equal group
+ # that has already been seen, otherwise register a new one.
+ groups_shapes_labels = []
+ for shape in self.shape_filter(GroupShape):
+ for group_shape in groups_shapes_labels:
+ if group_shape == shape:
+ shape.group_label = group_shape.group_label
+ break
+ else:
+ groups_shapes_labels.append(shape)
+ shape.group_label = f"group_{len(groups_shapes_labels)}"
+
+ @classmethod
+ def from_slide(
+ cls,
+ slide: PPTXSlide,
+ slide_idx: int,
+ real_idx: int,
+ slide_width: int,
+ slide_height: int,
+ config: Config,
+ shape_cast: dict[MSO_SHAPE_TYPE, type[ShapeElement] | None],
+ ) -> "SlidePage":
+ """
+ Create a SlidePage from a PPTXSlide.
+
+ Args:
+ slide (PPTXSlide): The slide object.
+ slide_idx (int): The index of the slide.
+ real_idx (int): The real index of the slide.
+ slide_width (int): The width of the slide.
+ slide_height (int): The height of the slide.
+ config (Config): The configuration object.
+ shape_cast (dict[MSO_SHAPE_TYPE, type[ShapeElement] | None]): Mapping of shape types to their corresponding ShapeElement classes.
+ Set the value to None for any MSO_SHAPE_TYPE to exclude that shape type from processing.
+ Returns:
+ SlidePage: The created SlidePage.
+ """
+ backgrounds = [Background.from_slide(slide, config)]
+ shapes = []
+ for i, shape in enumerate(slide.shapes):
+ if not shape.visible:
+ continue
+ if shape_cast.get(shape.shape_type, -1) is None:
+ continue
+ shapes.append(
+ ShapeElement.from_shape(
+ slide_idx, i, shape, config, slide_width * slide_height, shape_cast
+ )
+ )
+ # Treat near-full-slide pictures as backgrounds; iterate over a copy so
+ # removals do not skip subsequent shapes.
+ for s in list(shapes):
+ if isinstance(s, Picture) and s.area / s.slide_area > 0.95:
+ shapes.remove(s)
+ backgrounds.append(s)
+
+ slide_layout_name = slide.slide_layout.name if slide.slide_layout else None
+ slide_title = slide.shapes.title.text if slide.shapes.title else None
+ slide_notes = (
+ slide.notes_slide.notes_text_frame.text
+ if slide.has_notes_slide and slide.notes_slide.notes_text_frame
+ else None
+ )
+
+ return cls(
+ shapes,
+ backgrounds,
+ slide_idx,
+ real_idx,
+ slide_notes,
+ slide_layout_name,
+ slide_title,
+ slide_width,
+ slide_height,
+ )
+
+ def build(self, slide: PPTXSlide) -> PPTXSlide:
+ """
+ Build the slide page in a slide.
+
+ Args:
+ slide (PPTXSlide): The slide to build the slide page in.
+
+ Returns:
+ PPTXSlide: The built slide.
+ """
+ # Remove existing placeholders
+ for ph in slide.placeholders:
+ ph.element.getparent().remove(ph.element)
+
+ # Build backgrounds, shapes and apply closures
+ for shape in sorted(self.backgrounds + self.shapes, key=lambda x: x.shape_idx):
+ build_shape = shape.build(slide)
+ for closure in shape.closures:
+ try:
+ closure.apply(build_shape)
+ except Exception as e:
+ raise ValueError(f"Failed to apply closures to slides: {e}")
+ return slide
+
+ def iter_paragraphs(self) -> Generator[Paragraph, None, None]:
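+ """Yield every indexed paragraph (idx != -1) from text-bearing shapes, including those inside group shapes."""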
+ for shape in self:  # iterating the slide also yields shapes inside groups
+ if not shape.text_frame.is_textframe:
+ continue
+ for para in shape.text_frame.paragraphs:
+ if para.idx != -1:
+ yield para
+
+ def shape_filter(
+ self,
+ shape_type: type[ShapeElement],
+ from_groupshape: bool = True,
+ return_father: bool = False,
+ ) -> (
+ Generator[ShapeElement, None, None]
+ | Generator[tuple["SlidePage", ShapeElement], None, None]
+ ):
+ """
+ Filter shapes in the slide by type.
+
+ Args:
+ shape_type (Type[ShapeElement]): The type of shapes to filter.
+ from_groupshape (bool): Whether to recurse into group shapes.
+ return_father (bool): Whether to yield (container, shape) tuples instead of bare shapes.
+
+ Yields:
+ ShapeElement: The filtered shapes.
+ """
+ for shape in self.shapes:
+ if isinstance(shape, shape_type):
+ if return_father:
+ yield (self, shape)
+ else:
+ yield shape
+ elif from_groupshape and isinstance(shape, GroupShape):
+ yield from shape.shape_filter(shape_type, return_father)
+
+ def get_content_type(self) -> Literal["text", "image"]:
+ """
+ Get the content type of the slide.
+
+ Returns:
+ Literal["text", "image"]: The content type of the slide.
+ """
+ if len(list(self.shape_filter(Picture))) == 0:
+ return "text"
+ return "image"
+
+ def to_html(self, style_args: Optional[StyleArg] = None, **kwargs) -> str:
+ """
+ Represent the slide page in HTML.
+
+ Args:
+ style_args (Optional[StyleArg]): The style arguments for HTML conversion.
+ **kwargs: Additional arguments.
+
+ Returns:
+ str: The HTML representation of the slide page.
+ """
+ if style_args is None:
+ style_args = StyleArg(**kwargs)
+ shapes_html = [shape.to_html(style_args) for shape in self.shapes]
+ shapes_html = [html for html in shapes_html if html]
+ return "".join(
+ [
+ "\n\n",
+ (f"
and elements are modified.
+
+ Each command follows this format: (element_class, type, quantity_change: int, old_data, new_data).
+
+ Available APIs
+
+ {{api_docs}}
+
+ Steps
+ 1. Quantity Adjustment:
+ - If quantity_change = 0, modify the content only.
+ - If quantity_change > 0, use clone_paragraph to add the specified number of paragraphs from the same element_class. The paragraph_id for newly cloned paragraphs should be the current maximum paragraph_id of the parent element plus 1.
+ - If quantity_change < 0, use del_paragraph or del_image to remove the specified number of tail elements.
+ - Each command’s API call group must exclusively use either clone_paragraph or del_paragraph/del_image based on the `quantity_change`
+ 2. Content Modification:
+ - Text Content: Use replace_paragraph to modify the content.
+ - Image Content: Use replace_image to replace image resources.
+ 3. Output Format:
+ - Add comments to each API call group, explaining the intent of the original command and the associated element_class.
+ - For cloning operations, annotate the paragraph_id of the newly created paragraphs.
+
+ Example Input:
+
+
+ WorldFAIR: Global cooperation on FAIR data policy and practice +
+` element or the `alt` attribute of an `<img>` element. You must use English.
+template: |
+ Please analyze the slide elements and create a structured slide schema in JSON format. You should:
+
+ 1. Understand the html representation of the slide, especially the style, layout, and the logical relationship between elements
+
+ 2. For each element, extract the following information:
+ - "name": The role and feature of the element, such as "section/main/sub title", "content bullets/paragraph", "portrait/landscape/square image", "presenters", "dates", "acknowledgments", etc.
+ - "description": A clear description of the element's purpose, style feature(e.g. red bold bullets, small circular images), do not mention any content detail
+ - "type": Literal["text", "image"]
+ - "data": List[str]
+ * For text elements: The content of each paragraph, defined as text within `<p>` tags, must be treated as a single, distinct item.
+ - Preserve newlines (`\n`) within paragraphs, ensuring they remain intact and are not split into multiple items. Only `<p>` tags serve as separators.
+ - Do not combine multiple `<p>` tags into a single item, regardless of logical or narrative connections. Each `<p>` tag represents a separate item.
+ * For image elements: Use the `alt` attribute of the `<img>` tag as the data of the image
+
+ 3. Do not include any empty elements, only given elements should be included
+
+ Example Input:
+
text0
+text1\ntext2\ntext3
+
+
+ Example Output:
+ {
+ "main title": {
+ "description": "main title of the slide",
+ "type": "text",
+ "data": ["text0"]
+ },
+ "content bullets": {
+ "description": "content bullets of the slide",
+ "type": "text",
+ "data": ["text1\ntext2\ntext3"]
+ },
+ "main image": {
+ "description": "main image of the slide",
+ "type": "image",
+ "data": ["caption of image"]
+ }
+ }
+ Example format:
+ {
+ "element_name": {
+ "description": "purpose of this element", # do not mention any detail, just purpose
+ "type": "text" or "image",
+ "data": ["text1", "text2"] or ["logo:...", "logo:..."]
+ }
+ }
+ Input:
+ {{slide}}
+
+ Output: Please provide the slide schema in a dict of JSON format
+
+jinja_args:
+ - slide
+
+use_model: language
+return_json: True
diff --git a/pptagent/runs/2025-07-05/165c18a8-8169-480d-a388-615ab6f420b0/task.json b/pptagent/runs/2025-07-05/165c18a8-8169-480d-a388-615ab6f420b0/task.json
new file mode 100644
index 0000000000000000000000000000000000000000..2b75c02249a3cb62e833b05003738f5099889347
--- /dev/null
+++ b/pptagent/runs/2025-07-05/165c18a8-8169-480d-a388-615ab6f420b0/task.json
@@ -0,0 +1 @@
+{"numberOfPages": 5, "pptx": "c1eb4d337b2aa71bec0b0bda89322db2", "pdf": "37fd83b93256101767cb27322fba795f"}
\ No newline at end of file
diff --git a/pptagent/runs/2025-07-05/170a771a-eb46-4f03-bba2-c77dae7dc110/final.pptx b/pptagent/runs/2025-07-05/170a771a-eb46-4f03-bba2-c77dae7dc110/final.pptx
new file mode 100644
index 0000000000000000000000000000000000000000..9482af8a2cba66312d26a07f871d416b19d55174
Binary files /dev/null and b/pptagent/runs/2025-07-05/170a771a-eb46-4f03-bba2-c77dae7dc110/final.pptx differ
diff --git a/pptagent/runs/2025-07-05/170a771a-eb46-4f03-bba2-c77dae7dc110/task.json b/pptagent/runs/2025-07-05/170a771a-eb46-4f03-bba2-c77dae7dc110/task.json
new file mode 100644
index 0000000000000000000000000000000000000000..2b75c02249a3cb62e833b05003738f5099889347
--- /dev/null
+++ b/pptagent/runs/2025-07-05/170a771a-eb46-4f03-bba2-c77dae7dc110/task.json
@@ -0,0 +1 @@
+{"numberOfPages": 5, "pptx": "c1eb4d337b2aa71bec0b0bda89322db2", "pdf": "37fd83b93256101767cb27322fba795f"}
\ No newline at end of file
diff --git a/pptagent/runs/2025-07-05/5284ef26-24fd-43c2-b435-5abad5de0cf8/task.json b/pptagent/runs/2025-07-05/5284ef26-24fd-43c2-b435-5abad5de0cf8/task.json
new file mode 100644
index 0000000000000000000000000000000000000000..c9520f954e0142a748a3a0b367fe1f0fbd73783d
--- /dev/null
+++ b/pptagent/runs/2025-07-05/5284ef26-24fd-43c2-b435-5abad5de0cf8/task.json
@@ -0,0 +1 @@
+{"numberOfPages": 3, "pptx": "0210ff6b414902fa05857e734dd5bcee", "pdf": "9145dbfce1296e2b0603293042aa883e"}
\ No newline at end of file
diff --git a/pptagent/runs/2025-07-05/a5c87f48-ac8c-4577-9a99-e5ba2e268bfc/task.json b/pptagent/runs/2025-07-05/a5c87f48-ac8c-4577-9a99-e5ba2e268bfc/task.json
new file mode 100644
index 0000000000000000000000000000000000000000..d8ab43227c50540420208cfe18312696b4098c04
--- /dev/null
+++ b/pptagent/runs/2025-07-05/a5c87f48-ac8c-4577-9a99-e5ba2e268bfc/task.json
@@ -0,0 +1 @@
+{"numberOfPages": 4, "pptx": "c1eb4d337b2aa71bec0b0bda89322db2", "pdf": "37fd83b93256101767cb27322fba795f"}
\ No newline at end of file
diff --git a/pptagent/runs/2025-07-05/b43539a3-11a0-42bd-8f7e-ca33253615a1/task.json b/pptagent/runs/2025-07-05/b43539a3-11a0-42bd-8f7e-ca33253615a1/task.json
new file mode 100644
index 0000000000000000000000000000000000000000..d8ab43227c50540420208cfe18312696b4098c04
--- /dev/null
+++ b/pptagent/runs/2025-07-05/b43539a3-11a0-42bd-8f7e-ca33253615a1/task.json
@@ -0,0 +1 @@
+{"numberOfPages": 4, "pptx": "c1eb4d337b2aa71bec0b0bda89322db2", "pdf": "37fd83b93256101767cb27322fba795f"}
\ No newline at end of file
diff --git a/pptagent/runs/2025-07-05/da363260-400b-4a00-b426-0282f3069046/final.pptx b/pptagent/runs/2025-07-05/da363260-400b-4a00-b426-0282f3069046/final.pptx
new file mode 100644
index 0000000000000000000000000000000000000000..d363ad7656b7bd0c87f202adf28763fb6772d8da
Binary files /dev/null and b/pptagent/runs/2025-07-05/da363260-400b-4a00-b426-0282f3069046/final.pptx differ
diff --git a/pptagent/runs/2025-07-05/da363260-400b-4a00-b426-0282f3069046/task.json b/pptagent/runs/2025-07-05/da363260-400b-4a00-b426-0282f3069046/task.json
new file mode 100644
index 0000000000000000000000000000000000000000..2b75c02249a3cb62e833b05003738f5099889347
--- /dev/null
+++ b/pptagent/runs/2025-07-05/da363260-400b-4a00-b426-0282f3069046/task.json
@@ -0,0 +1 @@
+{"numberOfPages": 5, "pptx": "c1eb4d337b2aa71bec0b0bda89322db2", "pdf": "37fd83b93256101767cb27322fba795f"}
\ No newline at end of file
diff --git a/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_0_Figure_27.jpeg b/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_0_Figure_27.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..c0634c7a006e30b50a6461037ce77313de502e60
Binary files /dev/null and b/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_0_Figure_27.jpeg differ
diff --git a/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_0_Figure_34.jpeg b/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_0_Figure_34.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..8148c7fa84fc604bf8ed33a29dbba0db60e943fb
Binary files /dev/null and b/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_0_Figure_34.jpeg differ
diff --git a/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_0_Figure_42.jpeg b/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_0_Figure_42.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..21a880ea5935b16630192986ad78d24b1612fe1a
Binary files /dev/null and b/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_0_Figure_42.jpeg differ
diff --git a/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_0_Figure_66.jpeg b/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_0_Figure_66.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..ad1698636602d6222a8cf4eb210ad1f2f551261f
Binary files /dev/null and b/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_0_Figure_66.jpeg differ
diff --git a/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_0_Figure_74.jpeg b/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_0_Figure_74.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..eb3b15ab2894b1494d615ca8650e3e19d3b5b005
Binary files /dev/null and b/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_0_Figure_74.jpeg differ
diff --git a/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_0_Figure_85.jpeg b/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_0_Figure_85.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..5ca14e9f7b0eabfc3cfaadb23315df39ebfd48d2
Binary files /dev/null and b/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_0_Figure_85.jpeg differ
diff --git a/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_0_Picture_2.jpeg b/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_0_Picture_2.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..c789b1c562ca98e1352eafbac9f9db677e106b7e
Binary files /dev/null and b/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_0_Picture_2.jpeg differ
diff --git a/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_1_Picture_52.jpeg b/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_1_Picture_52.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..548b94a4c322997566b552c745db4d1cf7d0ab66
Binary files /dev/null and b/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_1_Picture_52.jpeg differ
diff --git a/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/meta.json b/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/meta.json
new file mode 100644
index 0000000000000000000000000000000000000000..5f44434f7cdd689321edb60651a142cf159b2f5e
--- /dev/null
+++ b/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/meta.json
@@ -0,0 +1,693 @@
+{
+ "table_of_contents": [
+ {
+ "title": "Building effective agents",
+ "heading_level": null,
+ "page_id": 0,
+ "polygon": [
+ [
+ 515.60107421875,
+ 129.55579376220703
+ ],
+ [
+ 1456.3255615234375,
+ 129.55579376220703
+ ],
+ [
+ 1456.3255615234375,
+ 216.3658676147461
+ ],
+ [
+ 515.60107421875,
+ 216.3658676147461
+ ]
+ ]
+ },
+ {
+ "title": "What are agents?",
+ "heading_level": null,
+ "page_id": 0,
+ "polygon": [
+ [
+ 623.978759765625,
+ 960.8446655273438
+ ],
+ [
+ 956.5999755859375,
+ 960.8446655273438
+ ],
+ [
+ 956.5999755859375,
+ 997.7664184570312
+ ],
+ [
+ 623.978759765625,
+ 997.7664184570312
+ ]
+ ]
+ },
+ {
+ "title": "When (and when not) to use\nagents",
+ "heading_level": null,
+ "page_id": 0,
+ "polygon": [
+ [
+ 617.9833984375,
+ 1626.3153076171875
+ ],
+ [
+ 1136.0601806640625,
+ 1626.3153076171875
+ ],
+ [
+ 1136.0601806640625,
+ 1704.6533203125
+ ],
+ [
+ 617.9833984375,
+ 1704.6533203125
+ ]
+ ]
+ },
+ {
+ "title": "When and how to use frameworks",
+ "heading_level": null,
+ "page_id": 0,
+ "polygon": [
+ [
+ 620.75048828125,
+ 2129.1544189453125
+ ],
+ [
+ 1259.02587890625,
+ 2129.1544189453125
+ ],
+ [
+ 1259.02587890625,
+ 2171.3507080078125
+ ],
+ [
+ 620.75048828125,
+ 2171.3507080078125
+ ]
+ ]
+ },
+ {
+ "title": "Building blocks, workflows, and\nagents",
+ "heading_level": null,
+ "page_id": 0,
+ "polygon": [
+ [
+ 615.21630859375,
+ 2918.6533203125
+ ],
+ [
+ 1202.76171875,
+ 2918.6533203125
+ ],
+ [
+ 1202.76171875,
+ 2997.6947021484375
+ ],
+ [
+ 615.21630859375,
+ 2997.6947021484375
+ ]
+ ]
+ },
+ {
+ "title": "Building block: The augmented LLM",
+ "heading_level": null,
+ "page_id": 0,
+ "polygon": [
+ [
+ 618.444580078125,
+ 3215.7088623046875
+ ],
+ [
+ 1140.041015625,
+ 3215.7088623046875
+ ],
+ [
+ 1140.041015625,
+ 3250.8724365234375
+ ],
+ [
+ 618.444580078125,
+ 3250.8724365234375
+ ]
+ ]
+ },
+ {
+ "title": "The augmented LLM",
+ "heading_level": null,
+ "page_id": 0,
+ "polygon": [
+ [
+ 464.5,
+ 3851.2904663085938
+ ],
+ [
+ 609.220947265625,
+ 3851.2904663085938
+ ],
+ [
+ 609.220947265625,
+ 3882.05859375
+ ],
+ [
+ 464.5,
+ 3882.05859375
+ ]
+ ]
+ },
+ {
+ "title": "Workflow: Prompt chaining",
+ "heading_level": null,
+ "page_id": 0,
+ "polygon": [
+ [
+ 621.211669921875,
+ 4257.429748535156
+ ],
+ [
+ 997.9970703125,
+ 4257.429748535156
+ ],
+ [
+ 997.9970703125,
+ 4285.560607910156
+ ],
+ [
+ 621.211669921875,
+ 4285.560607910156
+ ]
+ ]
+ },
+ {
+ "title": "Workflow: Routing",
+ "heading_level": null,
+ "page_id": 0,
+ "polygon": [
+ [
+ 624.5,
+ 5309.69970703125
+ ],
+ [
+ 878.0260009765625,
+ 5309.69970703125
+ ],
+ [
+ 878.0260009765625,
+ 5337.83056640625
+ ],
+ [
+ 624.5,
+ 5337.83056640625
+ ]
+ ]
+ },
+ {
+ "title": "Workflow: Parallelization",
+ "heading_level": null,
+ "page_id": 0,
+ "polygon": [
+ [
+ 623.517578125,
+ 6389.80517578125
+ ],
+ [
+ 977.705078125,
+ 6389.80517578125
+ ],
+ [
+ 977.705078125,
+ 6422.6268310546875
+ ],
+ [
+ 623.517578125,
+ 6422.6268310546875
+ ]
+ ]
+ },
+ {
+ "title": "Sectioning:",
+ "heading_level": null,
+ "page_id": 0,
+ "polygon": [
+ [
+ 640.581298828125,
+ 7428.964370727539
+ ],
+ [
+ 786.314697265625,
+ 7428.964370727539
+ ],
+ [
+ 786.314697265625,
+ 7457.974319458008
+ ],
+ [
+ 640.581298828125,
+ 7457.974319458008
+ ]
+ ]
+ },
+ {
+ "title": "Workflow: Orchestrator-workers",
+ "heading_level": null,
+ "page_id": 0,
+ "polygon": [
+ [
+ 622.59521484375,
+ 7972.80517578125
+ ],
+ [
+ 1076.2110595703125,
+ 7972.80517578125
+ ],
+ [
+ 1076.2110595703125,
+ 8002.350402832031
+ ],
+ [
+ 622.59521484375,
+ 8002.350402832031
+ ]
+ ]
+ },
+ {
+ "title": "Workflow: Evaluator-optimizer",
+ "heading_level": null,
+ "page_id": 0,
+ "polygon": [
+ [
+ 611.52685546875,
+ 9054.620361328125
+ ],
+ [
+ 1053.3388671875,
+ 9054.620361328125
+ ],
+ [
+ 1053.3388671875,
+ 9086.267578125
+ ],
+ [
+ 611.52685546875,
+ 9086.267578125
+ ]
+ ]
+ },
+ {
+ "title": "The evaluator-optimizer workflow",
+ "heading_level": null,
+ "page_id": 0,
+ "polygon": [
+ [
+ 461.642822265625,
+ 9604.930297851562
+ ],
+ [
+ 712.525634765625,
+ 9604.930297851562
+ ],
+ [
+ 712.525634765625,
+ 9633.061157226562
+ ],
+ [
+ 461.642822265625,
+ 9633.061157226562
+ ]
+ ]
+ },
+ {
+ "title": "Examples where evaluator-optimizer is useful:",
+ "heading_level": null,
+ "page_id": 0,
+ "polygon": [
+ [
+ 623.517578125,
+ 9940.742431640625
+ ],
+ [
+ 1072.70849609375,
+ 9940.742431640625
+ ],
+ [
+ 1072.70849609375,
+ 9972.3896484375
+ ],
+ [
+ 623.517578125,
+ 9972.3896484375
+ ]
+ ]
+ },
+ {
+ "title": "Agents",
+ "heading_level": null,
+ "page_id": 0,
+ "polygon": [
+ [
+ 614.755126953125,
+ 10197.4365234375
+ ],
+ [
+ 722.4810180664062,
+ 10197.4365234375
+ ],
+ [
+ 722.4810180664062,
+ 10226.80517578125
+ ],
+ [
+ 614.755126953125,
+ 10226.80517578125
+ ]
+ ]
+ },
+ {
+ "title": "Combining and customizing these\npatterns",
+ "heading_level": null,
+ "page_id": 0,
+ "polygon": [
+ [
+ 623.517578125,
+ 12692.47412109375
+ ],
+ [
+ 1251.64697265625,
+ 12692.47412109375
+ ],
+ [
+ 1251.64697265625,
+ 12780.309814453125
+ ],
+ [
+ 623.517578125,
+ 12780.309814453125
+ ]
+ ]
+ },
+ {
+ "title": "Summary",
+ "heading_level": null,
+ "page_id": 0,
+ "polygon": [
+ [
+ 615.21630859375,
+ 13029.76318359375
+ ],
+ [
+ 808.6976318359375,
+ 13029.76318359375
+ ],
+ [
+ 808.6976318359375,
+ 13068.410888671875
+ ],
+ [
+ 615.21630859375,
+ 13068.410888671875
+ ]
+ ]
+ },
+ {
+ "title": "Acknowledgements",
+ "heading_level": null,
+ "page_id": 0,
+ "polygon": [
+ [
+ 618.90576171875,
+ 13679.747314453125
+ ],
+ [
+ 892.84765625,
+ 13679.747314453125
+ ],
+ [
+ 892.84765625,
+ 13707.854736328125
+ ],
+ [
+ 618.90576171875,
+ 13707.854736328125
+ ]
+ ]
+ },
+ {
+ "title": "Appendix 1: Agents in practice",
+ "heading_level": null,
+ "page_id": 0,
+ "polygon": [
+ [
+ 620.75048828125,
+ 13875.653076171875
+ ],
+ [
+ 1182.4697265625,
+ 13875.653076171875
+ ],
+ [
+ 1182.4697265625,
+ 13911.653106689453
+ ],
+ [
+ 620.75048828125,
+ 13911.653106689453
+ ]
+ ]
+ },
+ {
+ "title": "A. Customer support",
+ "heading_level": null,
+ "page_id": 0,
+ "polygon": [
+ [
+ 616.138671875,
+ 14162.843627929688
+ ],
+ [
+ 909.199951171875,
+ 14162.843627929688
+ ],
+ [
+ 909.199951171875,
+ 14194.464477539062
+ ],
+ [
+ 616.138671875,
+ 14194.464477539062
+ ]
+ ]
+ },
+ {
+ "title": "B. Coding agents",
+ "heading_level": null,
+ "page_id": 1,
+ "polygon": [
+ [
+ 620.75048828125,
+ 286.9424473044454
+ ],
+ [
+ 856.87548828125,
+ 286.9424473044454
+ ],
+ [
+ 856.87548828125,
+ 316.3837091258816
+ ],
+ [
+ 620.75048828125,
+ 316.3837091258816
+ ]
+ ]
+ },
+ {
+ "title": "Appendix 2: Prompt engineering\nyour tools",
+ "heading_level": null,
+ "page_id": 1,
+ "polygon": [
+ [
+ 624.5,
+ 824.3553310002137
+ ],
+ [
+ 1221.208984375,
+ 824.3553310002137
+ ],
+ [
+ 1221.208984375,
+ 902.653076171875
+ ],
+ [
+ 624.5,
+ 902.653076171875
+ ]
+ ]
+ },
+ {
+ "title": "",
+ "heading_level": null,
+ "page_id": 1,
+ "polygon": [
+ [
+ 26.791770935058594,
+ 2699.918776865774
+ ],
+ [
+ 81.629150390625,
+ 2699.918776865774
+ ],
+ [
+ 81.629150390625,
+ 2728.3545980826484
+ ],
+ [
+ 26.791770935058594,
+ 2728.3545980826484
+ ]
+ ]
+ },
+ {
+ "title": "Product",
+ "heading_level": null,
+ "page_id": 1,
+ "polygon": [
+ [
+ 295.617431640625,
+ 2695.9569091796875
+ ],
+ [
+ 383.241943359375,
+ 2695.9569091796875
+ ],
+ [
+ 383.241943359375,
+ 2720.6498857772513
+ ],
+ [
+ 295.617431640625,
+ 2720.6498857772513
+ ]
+ ]
+ },
+ {
+ "title": "API Platform",
+ "heading_level": null,
+ "page_id": 1,
+ "polygon": [
+ [
+ 317.6874694824219,
+ 3033.876074500511
+ ],
+ [
+ 433.971923828125,
+ 3033.876074500511
+ ],
+ [
+ 433.971923828125,
+ 3054.026860530011
+ ],
+ [
+ 317.6874694824219,
+ 3054.026860530011
+ ]
+ ]
+ }
+ ],
+ "page_stats": [
+ {
+ "page_id": 0,
+ "text_extraction_method": "pdftext",
+ "block_counts": [
+ [
+ "Span",
+ 696
+ ],
+ [
+ "Line",
+ 268
+ ],
+ [
+ "Text",
+ 49
+ ],
+ [
+ "ListItem",
+ 31
+ ],
+ [
+ "SectionHeader",
+ 21
+ ],
+ [
+ "ListGroup",
+ 11
+ ],
+ [
+ "Figure",
+ 6
+ ],
+ [
+ "Form",
+ 3
+ ],
+ [
+ "PageHeader",
+ 2
+ ],
+ [
+ "Picture",
+ 1
+ ],
+ [
+ "PageFooter",
+ 1
+ ],
+ [
+ "Caption",
+ 1
+ ]
+ ]
+ },
+ {
+ "page_id": 1,
+ "text_extraction_method": "pdftext",
+ "block_counts": [
+ [
+ "Span",
+ 304
+ ],
+ [
+ "Line",
+ 136
+ ],
+ [
+ "Text",
+ 36
+ ],
+ [
+ "ListItem",
+ 15
+ ],
+ [
+ "SectionHeader",
+ 5
+ ],
+ [
+ "ListGroup",
+ 4
+ ],
+ [
+ "Picture",
+ 1
+ ]
+ ]
+ }
+ ],
+ "debug_data_path": "debug_data/source"
+}
\ No newline at end of file
diff --git a/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/refined_doc.json b/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/refined_doc.json
new file mode 100644
index 0000000000000000000000000000000000000000..c000871814e4360633849e6bde9e6f8bf5430dec
--- /dev/null
+++ b/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/refined_doc.json
@@ -0,0 +1,295 @@
+{
+ "image_dir": "/Users/shijingwei/Desktop/PresentAgent/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f",
+ "sections": [
+ {
+ "title": "Building effective agents",
+ "summary": "Appendix 2 emphasizes the importance of prompt engineering when defining tools for agentic systems like Claude, which utilize external services and APIs. It recommends considering multiple action specifications, ensuring accessibility, and minimizing formatting overhead to facilitate model understanding. Key strategies include clear tool definitions, including examples, refining parameter descriptions for clarity, testing model interactions with tools, and implementing design changes to reduce errors. The section asserts that optimizing tool specifications can significantly influence the agent's performance, often requiring more attention than overall prompt design.",
+ "subsections": [
+ {
+ "title": "What are agents?",
+ "content": "\"Agent\" can be defined in several ways. Some customers define agents as fully autonomous systems that operate independently over extended periods, using various tools to accomplish complex tasks. Others use the term to describe more prescriptive implementations that follow predefined workflows. At Anthropic, we categorize all these variations as agentic systems, but draw an important architectural distinction between workflows and agents: Workflows are systems where LLMs and tools are orchestrated through predefined code paths. Agents, on the other hand, are systems where LLMs dynamically direct their own processes and tool usage, maintaining control over how they accomplish tasks. Below, we will explore both types of agentic systems in detail. In Appendix 1 (\"Agents in Practice\"), we describe two domains where customers have found particular value in using these kinds of systems.",
+ "medias": []
+ },
+ {
+ "title": "When (and when not) to use agents",
+ "content": "When building applications with LLMs, we recommend finding the simplest solution possible, and only increasing complexity when needed. This might mean not building agentic systems at all. Agentic systems often trade latency and cost for better task performance, and you should consider when this tradeoff makes sense. When more complexity is warranted, workflows offer predictability and consistency for well-defined tasks, whereas agents are the better option when flexibility and model-driven decision-making are needed at scale. For many applications, however, optimizing single LLM calls with retrieval and in-context examples is usually enough.",
+ "medias": []
+ }
+ ],
+ "markdown_content": null
+ },
+ {
+ "title": "When and how to use frameworks",
+ "summary": "Appendix 2 emphasizes the importance of prompt engineering when defining tools for agentic systems like Claude, which utilize external services and APIs. It recommends considering multiple action specifications, ensuring accessibility, and minimizing formatting overhead to facilitate model understanding. Key strategies include clear tool definitions, including examples, refining parameter descriptions for clarity, testing model interactions with tools, and implementing design changes to reduce errors. The section asserts that optimizing tool specifications can significantly influence the agent's performance, often requiring more attention than overall prompt design.",
+ "subsections": [
+ {
+ "title": "Frameworks Overview",
+ "content": "There are many frameworks that make agentic systems easier to implement, including: LangGraph from LangChain; Amazon Bedrock's AI Agent framework; Rivet, a drag and drop GUI LLM workflow builder; and Vellum, another GUI tool for building and testing complex workflows. These frameworks make it easy to get started by simplifying standard low-level tasks like calling LLMs, defining and parsing tools, and chaining calls together. However, they often create extra layers of abstraction that can obscure the underlying prompts and responses, making them harder to debug. They can also make it tempting to add complexity when a simpler setup would suffice. We suggest that developers start by using LLM APIs directly: many patterns can be implemented in a few lines of code. If you do use a framework, ensure you understand the underlying code. Incorrect assumptions about what's under the hood are a common source of customer error. See our cookbook for some sample implementations.",
+ "medias": [
+ {
+ "markdown_content": "",
+ "near_chunks": [
+ "The basic building block of agentic systems is an LLM enhanced with augmentations such as retrieval, tools, and memory. Our current models can actively use these capabilities—generating their own search queries, selecting appropriate tools, and determining what information to retain.\n\n",
+ "#### The augmented LLM\n\nThe prompt chaining workflow\n\nWe recommend focusing on two key aspects of the implementation: tailoring these capabilities to your specific use case and ensuring they provide an easy, well-documented interface for your LLM. While there are many ways to implement these augmentations, one approach is through our recently released Model Context Protocol, which allows developers to integrate with a growing ecosystem of third-party tools with a simple client implementation.\n\n"
+ ],
+ "path": "/Users/shijingwei/Desktop/PresentAgent/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_0_Figure_27.jpeg",
+ "caption": "Diagram: Flowchart illustrating the interactions in an augmented LLM system. It shows input and output paths along with components like Retrieval, Tools, and Memory, detailing how the system processes queries and responses."
+ },
+ {
+ "markdown_content": "",
+ "near_chunks": [
+ "Prompt chaining decomposes a task into a sequence of steps, where each LLM call processes the output of the previous one. You can add programmatic checks (see \"gate\" in the diagram below) on any intermediate steps to ensure that the process is still on track.\n\n",
+ "When to use this workflow: This workflow is ideal for situations where the task can be easily and cleanly decomposed into fixed subtasks. The main goal is to trade off latency for higher accuracy, by making each LLM call an easier task.\n\nExamples where prompt chaining is useful:\n\n"
+ ],
+ "path": "/Users/shijingwei/Desktop/PresentAgent/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_0_Figure_34.jpeg",
+ "caption": "Diagram: Flowchart illustrating a prompt chaining workflow. It shows the sequence of steps from input through several LLM calls, with a gate determining whether to pass to the next step or exit."
+ },
+ {
+ "markdown_content": "",
+ "near_chunks": [
+ "Routing classifies an input and directs it to a specialized followup task. This workflow allows for separation of concerns, and building more specialized prompts. Without this workflow, optimizing for one kind of input can hurt performance on other inputs.\n\n",
+ "The routing workflow\n\nWhen to use this workflow: Routing works well for complex tasks where there are distinct categories that are better handled separately, and where classification can be handled accurately, either by an LLM or a more traditional classification model/algorithm.\n\n"
+ ],
+ "path": "/Users/shijingwei/Desktop/PresentAgent/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_0_Figure_42.jpeg",
+ "caption": "Diagram: Flowchart illustrating the routing process for LLM calls. Shows an input leading to an LLM call router, which directs to multiple LLM calls and outputs a result, highlighting the separation of tasks in complex workflows."
+ },
+ {
+ "markdown_content": "| 7 | LLM Call 1 | 7 | | |\n| --- | --- | --- | --- | --- |\n| > In | LLM Call 2 | > | Aggregator | Out 1 |\n| 1 | LLM Call 3 | 기 | | |",
+ "near_chunks": [
+ "- Sectioning: Breaking a task into independent subtasks run in parallel.\n- Voting: Running the same task multiple times to get diverse outputs.\n\nLLMs can sometimes work simultaneously on a task and have their outputs aggregated programmatically. This workflow, parallelization, manifests in two key variations:\n\n",
+ "The parallelization workflow\n\ndivided subtasks can be parallelized for speed, or when multiple perspectives or attempts are needed for higher confidence results. For complex tasks with multiple considerations, LLMs generally perform better when each consideration is handled by a separate LLM call, allowing focused attention on each specific aspect.\n\n"
+ ],
+ "path": "/Users/shijingwei/Desktop/PresentAgent/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/table_9f02.png",
+ "caption": "Table: This table outlines different LLM calls and their outputs, illustrating a parallelization workflow where tasks are divided and processed separately to enhance efficiency and output diversity.",
+ "cells": [
+ [
+ "7",
+ "LLM Call 1",
+ "7",
+ "",
+ ""
+ ],
+ [
+ "> In",
+ "LLM Call 2",
+ ">",
+ "Aggregator",
+ "Out 1"
+ ],
+ [
+ "1",
+ "LLM Call 3",
+ "기",
+ "",
+ ""
+ ]
+ ],
+ "merge_area": null
+ },
+ {
+ "markdown_content": "",
+ "near_chunks": [
+ "In the orchestrator-workers workflow, a central LLM dynamically breaks down tasks, delegates them to worker LLMs, and synthesizes their results.\n\n#### Workflow: Orchestrator-workers\n\n- Implementing guardrails where one model instance processes user queries while another screens them for inappropriate content or requests. This tends to perform better than having the same LLM call handle both guardrails and the core response.\n- Automating evals for evaluating LLM performance, where each LLM call evaluates a different aspect of the model's performance on a given prompt.\n- Voting:\n- Reviewing a piece of code for vulnerabilities, where several different prompts review and flag the code if they find a problem.\n- Evaluating whether a given piece of content is inappropriate, with multiple prompts evaluating different aspects or requiring different vote thresholds to balance false positives and negatives.\n\n",
+ "The orchestrator-workers workflow\n\nWhen to use this workflow: This workflow is well-suited for complex tasks where you can't predict the subtasks needed (in coding, for example, the number of files that need to be changed and the nature of the change in each file likely depend on the task). Whereas it's topographically similar, the key difference from parallelization is its flexibility—subtasks aren't pre-defined, but determined by the orchestrator based on the specific input.\n\n"
+ ],
+ "path": "/Users/shijingwei/Desktop/PresentAgent/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_0_Figure_66.jpeg",
+ "caption": "Diagram: Workflow illustration showing the process flow of an orchestrator managing multiple LLM calls. Inputs are directed to the orchestrator, which delegates tasks to LLM Call 1, Call 2, and Call 3, before synthesizing results and producing an output."
+ },
+ {
+ "markdown_content": "",
+ "near_chunks": [
+ "In the evaluator-optimizer workflow, one LLM call generates a response while another provides evaluation and feedback in a loop.\n\n#### Workflow: Evaluator-optimizer\n\n- Coding products that make complex changes to multiple files each time.\n- Search tasks that involve gathering and analyzing information from multiple sources for possible relevant information.\n\n",
+ "#### The evaluator-optimizer workflow\n\nWhen to use this workflow: This workflow is particularly effective when we have clear evaluation criteria, and when iterative\n\nrefinement provides measurable value. The two signs of good fit are, first, that LLM responses can be demonstrably improved when a human articulates their feedback; and second, that the LLM can provide such feedback. This is analogous to the iterative writing process a human writer might go through when producing a polished document.\n\n"
+ ],
+ "path": "/Users/shijingwei/Desktop/PresentAgent/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_0_Figure_74.jpeg",
+ "caption": "Diagram: Flowchart illustrating the evaluator-optimizer workflow. It shows input leading to an LLM Call Generator, which outputs a solution. An LLM Call Evaluator processes the solution, providing feedback, leading to either acceptance or rejection."
+ },
+ {
+ "markdown_content": "| (\"Prompt Engineering your Tools\"). |\n| --- |",
+ "near_chunks": [
+ "Agents can handle sophisticated tasks, but their implementation is often straightforward. They are typically just LLMs using tools based on environmental feedback in a loop. It is therefore crucial to design toolsets and their documentation clearly and thoughtfully. We expand on best practices for tool development in Appendix 2\n\n",
+ "When to use agents: Agents can be used for open-ended problems where it's difficult or impossible to predict the required number of steps, and where you can't hardcode a fixed path. The LLM will potentially operate for many turns, and you must have some level of trust in its decision-making. Agents' autonomy makes them ideal for scaling tasks in trusted environments.\n\n"
+ ],
+ "path": "/Users/shijingwei/Desktop/PresentAgent/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/table_ef93.png",
+ "caption": "Table: Best practices and considerations for prompt engineering tools in the context of developing agents that utilize large language models (LLMs) for effective and reliable task execution.",
+ "cells": [
+ [
+ "(\"Prompt Engineering your Tools\")."
+ ],
+ [
+ ""
+ ]
+ ],
+ "merge_area": null
+ },
+ {
+ "markdown_content": "",
+ "near_chunks": [
+ "Agents can handle sophisticated tasks, but their implementation is often straightforward. They are typically just LLMs using tools based on environmental feedback in a loop. It is therefore crucial to design toolsets and their documentation clearly and thoughtfully. We expand on best practices for tool development in Appendix 2\n\n",
+ "When to use agents: Agents can be used for open-ended problems where it's difficult or impossible to predict the required number of steps, and where you can't hardcode a fixed path. The LLM will potentially operate for many turns, and you must have some level of trust in its decision-making. Agents' autonomy makes them ideal for scaling tasks in trusted environments.\n\n"
+ ],
+ "path": "/Users/shijingwei/Desktop/PresentAgent/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_0_Figure_85.jpeg",
+ "caption": "Diagram: Flowchart illustrating the interaction between a human, an LLM call, and the environment. It depicts a feedback loop with an action path, indicating the process of feedback and stopping mechanisms."
+ }
+ ]
+ },
+ {
+ "title": "Building blocks, workflows, and agents",
+ "content": "In this section, we'll explore the common patterns for agentic systems we've seen in production. We'll start with our foundational building block—the augmented LLM—and progressively increase complexity, from simple compositional workflows to autonomous agents.",
+ "medias": []
+ },
+ {
+ "title": "Building block: The augmented LLM",
+ "content": "The basic building block of agentic systems is an LLM enhanced with augmentations such as retrieval, tools, and memory. Our current models can actively use these capabilities—generating their own search queries, selecting appropriate tools, and determining what information to retain.",
+ "medias": []
+ },
+ {
+ "title": "The prompt chaining workflow",
+ "content": "We recommend focusing on two key aspects of the implementation: tailoring these capabilities to your specific use case and ensuring they provide an easy, well-documented interface for your LLM. While there are many ways to implement these augmentations, one approach is through our recently released Model Context Protocol, which allows developers to integrate with a growing ecosystem of third-party tools with a simple client implementation. For the remainder of this post, we'll assume each LLM call has access to these augmented capabilities.",
+ "medias": []
+ },
+ {
+ "title": "Workflow: Prompt chaining",
+ "content": "Prompt chaining decomposes a task into a sequence of steps, where each LLM call processes the output of the previous one. You can add programmatic checks (see 'gate' in the diagram below) on any intermediate steps to ensure that the process is still on track. When to use this workflow: This workflow is ideal for situations where the task can be easily and cleanly decomposed into fixed subtasks. The main goal is to trade off latency for higher accuracy, by making each LLM call an easier task. Examples where prompt chaining is useful: Generating Marketing copy, then translating it into a different language. Writing an outline of a document, checking that the outline meets certain criteria, then writing the document based on the outline.",
+ "medias": []
+ },
+ {
+ "title": "Workflow: Routing",
+ "content": "Routing classifies an input and directs it to a specialized followup task. This workflow allows for separation of concerns, and building more specialized prompts. Without this workflow, optimizing for one kind of input can hurt performance on other inputs. When to use this workflow: Routing works well for complex tasks where there are distinct categories that are better handled separately, and where classification can be handled accurately, either by an LLM or a more traditional classification model/algorithm. Examples where routing is useful: Directing different types of customer service queries (general questions, refund requests, technical support) into different downstream processes, prompts, and tools. Routing easy/common questions to smaller models like Claude 3.5 Haiku and hard/unusual questions to more capable models like Claude 3.5 Sonnet to optimize cost and speed.",
+ "medias": []
+ },
+ {
+ "title": "Workflow: Parallelization",
+ "content": "LLMs can sometimes work simultaneously on a task and have their outputs aggregated programmatically. This workflow, parallelization, manifests in two key variations: Sectioning: Breaking a task into independent subtasks run in parallel and Voting: Running the same task multiple times to get diverse outputs. Divided subtasks can be parallelized for speed, or when multiple perspectives or attempts are needed for higher confidence results. For complex tasks with multiple considerations, LLMs generally perform better when each consideration is handled by a separate LLM call, allowing focused attention on each specific aspect. Examples where parallelization is useful: Implementing guardrails where one model instance processes user queries while another screens them for inappropriate content or requests. Automating evals for evaluating LLM performance, where each LLM call evaluates a different aspect of the model's performance on a given prompt.",
+ "medias": []
+ },
+ {
+ "title": "Workflow: Orchestrator-workers",
+ "content": "In the orchestrator-workers workflow, a central LLM dynamically breaks down tasks, delegates them to worker LLMs, and synthesizes their results. When to use this workflow: This workflow is well-suited for complex tasks where you can't predict the subtasks needed. Whereas it's topographically similar, the key difference from parallelization is its flexibility—subtasks aren't pre-defined, but determined by the orchestrator based on the specific input. Example where orchestrator-workers is useful: Coding products that make complex changes to multiple files each time. Search tasks that involve gathering and analyzing information from multiple sources for possible relevant information.",
+ "medias": []
+ },
+ {
+ "title": "Workflow: Evaluator-optimizer",
+ "content": "In the evaluator-optimizer workflow, one LLM call generates a response while another provides evaluation and feedback in a loop. When to use this workflow: This workflow is particularly effective when we have clear evaluation criteria, and when iterative refinement provides measurable value. The two signs of good fit are, first, that LLM responses can be demonstrably improved when a human articulates their feedback; and second, that the LLM can provide such feedback. This is analogous to the iterative writing process a human writer might go through when producing a polished document.",
+ "medias": []
+ },
+ {
+ "title": "Agents",
+ "content": "Agents are emerging in production as LLMs mature in key capabilities—understanding complex inputs, engaging in reasoning and planning, using tools reliably, and recovering from errors. Agents begin their work with either a command from, or interactive discussion with, the human user. Once the task is clear, agents plan and operate independently, potentially returning to the human for further information or judgement. During execution, it's crucial for the agents to gain 'ground truth' from the environment at each step (such as tool call results or code execution) to assess its progress. Agents can then pause for human feedback at checkpoints or when encountering blockers. The task often terminates upon completion, but it's also common to include stopping conditions (such as a maximum number of iterations) to maintain control. Examples where agents are useful: A coding Agent to resolve SWE-bench tasks, which involve edits to many files based on a task description; Our 'computer use' reference implementation, where Claude uses a computer to accomplish tasks.",
+ "medias": []
+ }
+ ],
+ "markdown_content": null
+ },
+ {
+ "title": "Combining and customizing these patterns",
+ "summary": "Appendix 2 emphasizes the importance of prompt engineering when defining tools for agentic systems like Claude, which utilize external services and APIs. It recommends considering multiple action specifications, ensuring accessibility, and minimizing formatting overhead to facilitate model understanding. Key strategies include clear tool definitions, including examples, refining parameter descriptions for clarity, testing model interactions with tools, and implementing design changes to reduce errors. The section asserts that optimizing tool specifications can significantly influence the agent's performance, often requiring more attention than overall prompt design.",
+ "subsections": [
+ {
+ "title": "Summary",
+ "content": "Success in the LLM space isn't about building the most sophisticated system. It's about building the *right* system for your needs. Start with simple prompts, optimize them with comprehensive evaluation, and add multi-step agentic systems only when simpler solutions fall short.\n\nWhen implementing agents, we try to follow three core principles:\n\n- Maintain simplicity in your agent's design. 2. Prioritize transparency by explicitly showing the agent's planning steps.\n- Carefully craft your agent-computer interface (ACI) through thorough tool documentation and testing.\n\nFrameworks can help you get started quickly, but don't hesitate to reduce abstraction layers and build with basic components as you move to production. By following these principles, you can create agents that are not only powerful but also reliable, maintainable, and trusted by their users.",
+ "medias": []
+ },
+ {
+ "title": "Acknowledgements",
+ "content": "Written by Erik Schluntz and Barry Zhang. This work draws upon our experiences building agents at Anthropic and the valuable insights shared by our customers, for which we're deeply grateful.",
+ "medias": []
+ },
+ {
+ "title": "Appendix 1: Agents in practice",
+ "content": "Our work with customers has revealed two particularly promising applications for AI agents that demonstrate the practical value of the patterns discussed above. Both applications illustrate how agents add the most value for tasks that require both conversation and action, have clear success criteria, enable feedback loops, and integrate meaningful human oversight.",
+ "medias": []
+ },
+ {
+ "title": "A. Customer support",
+ "content": "Customer support combines familiar chatbot interfaces with enhanced capabilities through tool integration. This is a natural fit for more open-ended agents because:\n\n- Support interactions naturally follow a conversation flow while requiring access to external information and actions; Tools can be integrated to pull customer data, order history, and knowledge base articles; Actions such as issuing refunds or updating tickets can be handled programmatically; and Success can be clearly measured through user-defined resolutions.\n\nSeveral companies have demonstrated the viability of this approach through usage-based pricing models that charge only for successful resolutions, showing confidence in their agents' effectiveness.",
+ "medias": []
+ },
+ {
+ "title": "B. Coding agents",
+ "content": "The software development space has shown remarkable potential for LLM features, with capabilities evolving from code completion to autonomous problem-solving. Agents are particularly effective because:\n\n- Code solutions are verifiable through automated tests;\n- Agents can iterate on solutions using test results as feedback;\n- The problem space is well-defined and structured; and\n- Output quality can be measured objectively.\n\nIn our own implementation, agents can now solve real GitHub issues in the SWE-bench Verified benchmark based on the pull request description alone. However, whereas automated testing helps verify functionality, human review remains crucial for ensuring solutions align with broader system requirements.",
+ "medias": []
+ }
+ ],
+ "markdown_content": null
+ },
+ {
+ "title": "Appendix 2: Prompt engineering your tools",
+ "summary": "Appendix 2 emphasizes the importance of prompt engineering when defining tools for agentic systems like Claude, which utilize external services and APIs. It recommends considering multiple action specifications, ensuring accessibility, and minimizing formatting overhead to facilitate model understanding. Key strategies include clear tool definitions, including examples, refining parameter descriptions for clarity, testing model interactions with tools, and implementing design changes to reduce errors. The section asserts that optimizing tool specifications can significantly influence the agent's performance, often requiring more attention than overall prompt design.",
+ "subsections": [
+ {
+ "title": "Introduction to Tools",
+ "content": "No matter which agentic system you're building, tools will likely be an important part of your agent. Tools enable Claude to interact with external services and APIs by specifying their exact structure and definition in our API. When Claude responds, it will include a tool use block in the API response if it plans to invoke a tool. Tool definitions and specifications should be given just as much prompt engineering attention as your overall prompts. In this brief appendix, we describe how to prompt engineer your tools.",
+ "medias": [
+ {
+ "markdown_content": "",
+ "near_chunks": [
+ "© 2025 Anthropic PBC\n\nUsage policy\n\nTerms of service commercial\n\nTerms of service consumer\n\nResponsible disclosure policy\n\nPrivacy policy\n\nPrivacy choices\n\nTerms and policies\n\nSupport center\n\nHelp and security Status Availability\n\nStartups program\n\nEvents News\n\n",
+ ""
+ ],
+ "path": "/Users/shijingwei/Desktop/PresentAgent/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/_page_1_Picture_52.jpeg",
+ "caption": "Icon: Three social media platform icons displayed in grayscale: YouTube, LinkedIn, and X. The icons are positioned side by side against a dark background."
+ }
+ ]
+ },
+ {
+ "title": "Guidelines for Tool Formats",
+ "content": "Our suggestions for deciding on tool formats are the following:\n\n- Give the model enough tokens to \"think\" before it writes itself into a corner.\n- Keep the format close to what the model has seen naturally occurring in text on the internet.\n- Make sure there's no formatting \"overhead\" such as having to keep an accurate count of thousands of lines of code, or string escaping any code it writes.",
+ "medias": []
+ },
+ {
+ "title": "Human-Computer Interfaces",
+ "content": "One rule of thumb is to think about how much effort goes into human-computer interfaces (HCI), and plan to invest just as much effort in creating good *agent*-computer interfaces (ACI). Here are some thoughts on how to do so:\n\n- Put yourself in the model's shoes. Is it obvious how to use this tool, based on the description and parameters, or would you need to think carefully about it? If so, then it's probably also true for the model. A good tool definition often includes example usage, edge cases, input format requirements, and clear boundaries from other tools.\n- How can you change parameter names or descriptions to make things more obvious? Think of this as writing a great docstring for a junior developer on your team. This is especially important when using many similar tools.\n- Test how the model uses your tools: Run many example inputs in our workbench to see what mistakes the model makes, and iterate.\n- Poka-yoke your tools. Change the arguments so that it is harder to make mistakes.",
+ "medias": []
+ },
+ {
+ "title": "Tool Optimization",
+ "content": "While building our agent for SWE-bench, we actually spent more time optimizing our tools than the overall prompt. For example, we found that the model would make mistakes with tools using relative filepaths after the agent had moved out of the root directory. To fix this, we changed the tool to always require absolute filepaths—and we found that the model used this method flawlessly.",
+ "medias": []
+ },
+ {
+ "title": "Product Overview",
+ "content": "Claude overview Claude Code Claude team plan Claude enterprise plan Claude education plan Download Claude apps Claude.ai pricing plans Claude.ai login",
+ "medias": []
+ },
+ {
+ "title": "API Platform",
+ "content": "API overview Developer docs Claude in Amazon Bedrock Claude on Google Cloud's Vertex AI Pricing",
+ "medias": []
+ },
+ {
+ "title": "Console and Research",
+ "content": "Console login\n\nResearch\n\nResearch overview Economic Index\n\nClaude models Claude Opus 4 Claude Sonnet 4\n\nClaude Haiku 3.5\n\nCommitments Transparency Responsible scaling policy Security and compliance",
+ "medias": []
+ },
+ {
+ "title": "Solutions and Learn",
+ "content": "Solutions AI agents\n\nCoding\n\nCustomer support\n\nLearn\n\nAnthropic Academy Customer stories Engineering at Anthropic MCP Integrations",
+ "medias": []
+ },
+ {
+ "title": "Corporate Information",
+ "content": "Explore About us Become a partner Careers\n\nEvents News\n\nStartups program\n\nHelp and security Status Availability\n\nSupport center\n\nTerms and policies\n\nPrivacy choices\n\nPrivacy policy\n\nResponsible disclosure policy\n\nTerms of service consumer\n\nTerms of service commercial\n\nUsage policy",
+ "medias": []
+ },
+ {
+ "title": "Copyright Notice",
+ "content": "© 2025 Anthropic PBC\n\n",
+ "medias": []
+ }
+ ],
+ "markdown_content": null
+ }
+ ],
+ "metadata": {
+ "title": "Building effective agents",
+ "publish_date": "Dec 19, 2024",
+ "authors": [
+ "Erik Schluntz",
+ "Barry Zhang"
+ ],
+ "organization": "Anthropic PBC",
+ "year": "2025",
+ "presentation-date": "2025-07-05"
+ }
+}
\ No newline at end of file
diff --git a/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/source.md b/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/source.md
new file mode 100644
index 0000000000000000000000000000000000000000..51219594d1d14138bf8f1a7e6bb06a959dedad6e
--- /dev/null
+++ b/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/source.md
@@ -0,0 +1,305 @@
+# Building effective agents
+
+
+
+Engineering at Anthropic
+
+Published Dec 19, 2024
+
+We've worked with dozens of teams building LLM agents across industries. Consistently, the most successful implementations use simple, composable patterns rather than complex frameworks.
+
+> Over the past year, we've worked with dozens of teams building large language model (LLM) agents across industries. Consistently, the most successful implementations weren't using complex frameworks or specialized libraries. Instead, they were building with simple, composable patterns.
+
+In this post, we share what we've learned from working with our customers and building agents ourselves, and give practical advice for developers on building effective agents.
+
+#### What are agents?
+
+"Agent" can be defined in several ways. Some customers define agents as fully autonomous systems that operate independently over extended periods, using various tools to accomplish complex tasks. Others use the term to describe more prescriptive implementations that follow predefined workflows. At Anthropic, we categorize all these variations as agentic systems, but draw an important architectural distinction between workflows and agents:
+
+- Workflows are systems where LLMs and tools are orchestrated through predefined code paths.
+- Agents, on the other hand, are systems where LLMs dynamically direct their own processes and tool usage, maintaining control over how they accomplish tasks.
+
+Below, we will explore both types of agentic systems in detail. In Appendix 1 ("Agents in Practice"), we describe two domains where customers have found particular value in using these kinds of systems.
+
+### When (and when not) to use agents
+
+When building applications with LLMs, we recommend finding the simplest solution possible, and only increasing complexity when needed. This might mean not building agentic systems at all. Agentic systems often trade latency and cost for better task performance, and you should consider when this tradeoff makes sense.
+
+When more complexity is warranted, workflows offer predictability and consistency for well-defined tasks, whereas agents are the better option when flexibility and model-driven decision-making are needed at scale. For many applications, however, optimizing single LLM calls with retrieval and in-context examples is usually enough.
+
+## When and how to use frameworks
+
+There are many frameworks that make agentic systems easier to implement, including:
+
+- LangGraph from LangChain;
+- Amazon Bedrock's AI Agent framework;
+- Rivet, a drag and drop GUI LLM workflow builder; and
+- Vellum, another GUI tool for building and testing complex workflows.
+
+These frameworks make it easy to get started by simplifying standard low-level tasks like calling LLMs, defining and parsing tools, and chaining calls together. However, they often create extra layers of abstraction that can obscure the underlying prompts and responses, making them harder to debug. They can also make it tempting to add complexity when a simpler setup would suffice.
+
+We suggest that developers start by using LLM APIs directly: many patterns can be implemented in a few lines of code. If you do use a framework, ensure you understand the underlying code. Incorrect assumptions about what's under the hood are a common source of customer error.
+
+See our cookbook for some sample implementations.
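+
+As a concrete starting point, here is a minimal framework-free sketch using the Anthropic Python SDK; the `call_llm` helper name and the model id are illustrative choices rather than anything prescribed by this post.
+
+```python
+from anthropic import Anthropic
+
+client = Anthropic()  # reads ANTHROPIC_API_KEY from the environment
+
+def call_llm(prompt: str, system: str = "You are a helpful assistant.") -> str:
+    """One LLM call, no framework: a single Messages API request."""
+    response = client.messages.create(
+        model="claude-3-5-sonnet-20241022",  # illustrative model id
+        max_tokens=1024,
+        system=system,
+        messages=[{"role": "user", "content": prompt}],
+    )
+    return response.content[0].text
+
+if __name__ == "__main__":
+    print(call_llm("Summarize the tradeoff between workflows and agents in two sentences."))
+```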
+
+### Building blocks, workflows, and agents
+
+In this section, we'll explore the common patterns for agentic systems we've seen in production. We'll start with our foundational building block—the augmented LLM—and progressively increase complexity, from simple compositional workflows to autonomous agents.
+
+#### Building block: The augmented LLM
+
+The basic building block of agentic systems is an LLM enhanced with augmentations such as retrieval, tools, and memory. Our current models can actively use these capabilities—generating their own search queries, selecting appropriate tools, and determining what information to retain.
+
+
+
+#### The augmented LLM
+
+The prompt chaining workflow
+
+We recommend focusing on two key aspects of the implementation: tailoring these capabilities to your specific use case and ensuring they provide an easy, well-documented interface for your LLM. While there are many ways to implement these augmentations, one approach is through our recently released Model Context Protocol, which allows developers to integrate with a growing ecosystem of third-party tools with a simple client implementation.
+
+For the remainder of this post, we'll assume each LLM call has access to these augmented capabilities.
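+
+To make the tools augmentation concrete, the sketch below declares a single, invented `get_order_status` tool on one Messages API call and prints any tool-use block the model returns; treat it as an illustration of the shape of a tool definition, not a recommended design.
+
+```python
+from anthropic import Anthropic
+
+client = Anthropic()
+
+# A hypothetical tool definition in the Messages API tool-use format.
+tools = [{
+    "name": "get_order_status",
+    "description": "Look up the current status of a customer order by its order id.",
+    "input_schema": {
+        "type": "object",
+        "properties": {
+            "order_id": {"type": "string", "description": "The order id, e.g. 'A-1234'."},
+        },
+        "required": ["order_id"],
+    },
+}]
+
+response = client.messages.create(
+    model="claude-3-5-sonnet-20241022",
+    max_tokens=1024,
+    tools=tools,
+    messages=[{"role": "user", "content": "Where is order A-1234?"}],
+)
+
+# If the model decided to call the tool, the response contains a tool_use block.
+for block in response.content:
+    if block.type == "tool_use":
+        print("Tool requested:", block.name, block.input)
+```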
+
+#### Workflow: Prompt chaining
+
+Prompt chaining decomposes a task into a sequence of steps, where each LLM call processes the output of the previous one. You can add programmatic checks (see "gate" in the diagram below) on any intermediate steps to ensure that the process is still on track.
+
+
+
+When to use this workflow: This workflow is ideal for situations where the task can be easily and cleanly decomposed into fixed subtasks. The main goal is to trade off latency for higher accuracy, by making each LLM call an easier task.
+
+Examples where prompt chaining is useful:
+
+- Generating Marketing copy, then translating it into a different language.
+- Writing an outline of a document, checking that the outline meets certain criteria, then writing the document based on the outline.
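+
+A minimal sketch of prompt chaining with a programmatic gate, loosely following the outline-then-write example above; `call_llm`, the prompts, and the outline length check are illustrative stand-ins for whatever single-call helper and validation your task actually needs.
+
+```python
+from anthropic import Anthropic
+
+client = Anthropic()
+
+def call_llm(prompt: str) -> str:
+    msg = client.messages.create(model="claude-3-5-sonnet-20241022", max_tokens=1024,
+                                 messages=[{"role": "user", "content": prompt}])
+    return msg.content[0].text
+
+def write_document(topic: str) -> str:
+    # Step 1: draft an outline.
+    outline = call_llm(f"Write a bullet-point outline for a short article about: {topic}")
+
+    # Gate: a cheap programmatic check before spending the next call.
+    if outline.count("-") < 3:
+        raise ValueError("Outline looks too thin; stop the chain and retry or escalate.")
+
+    # Step 2: expand the outline into the document.
+    return call_llm(f"Write the article following this outline exactly:\n\n{outline}")
+```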
+
+#### Workflow: Routing
+
+Routing classifies an input and directs it to a specialized followup task. This workflow allows for separation of concerns, and building more specialized prompts. Without this workflow, optimizing for one kind of input can hurt performance on other inputs.
+
+
+
+The routing workflow
+
+When to use this workflow: Routing works well for complex tasks where there are distinct categories that are better handled separately, and where classification can be handled accurately, either by an LLM or a more traditional classification model/algorithm.
+
+Examples where routing is useful:
+
+- Directing different types of customer service queries (general questions, refund requests, technical support) into different downstream processes, prompts, and tools.
+- Routing easy/common questions to smaller models like Claude 3.5 Haiku and hard/unusual questions to more capable models like Claude 3.5 Sonnet to optimize cost and speed.
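+
+A rough sketch of the routing pattern, mirroring the customer service example above; the category names, system prompts, and model choices are illustrative.
+
+```python
+from anthropic import Anthropic
+
+client = Anthropic()
+
+ROUTES = {
+    # category -> (model, specialized system prompt); all values are illustrative.
+    "general": ("claude-3-5-haiku-20241022", "Answer general product questions concisely."),
+    "refund": ("claude-3-5-sonnet-20241022", "Walk the user through the refund policy step by step."),
+    "technical": ("claude-3-5-sonnet-20241022", "Debug the user's technical issue; ask for logs if needed."),
+}
+
+def classify(query: str) -> str:
+    msg = client.messages.create(
+        model="claude-3-5-haiku-20241022", max_tokens=10,
+        system="Reply with exactly one word: general, refund, or technical.",
+        messages=[{"role": "user", "content": query}],
+    )
+    label = msg.content[0].text.strip().lower()
+    return label if label in ROUTES else "general"
+
+def handle(query: str) -> str:
+    model, system = ROUTES[classify(query)]
+    msg = client.messages.create(model=model, max_tokens=1024, system=system,
+                                 messages=[{"role": "user", "content": query}])
+    return msg.content[0].text
+```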
+
+#### Workflow: Parallelization
+
+LLMs can sometimes work simultaneously on a task and have their outputs aggregated programmatically. This workflow, parallelization, manifests in two key variations:
+
+- Sectioning: Breaking a task into independent subtasks run in parallel.
+- Voting: Running the same task multiple times to get diverse outputs.
+
+[Figure: the parallelization workflow. An input fans out to LLM Call 1, LLM Call 2, and LLM Call 3 in parallel; an aggregator combines their outputs into a single result.]
+
+The parallelization workflow
+
+When to use this workflow: Parallelization is effective when the divided subtasks can be parallelized for speed, or when multiple perspectives or attempts are needed for higher confidence results. For complex tasks with multiple considerations, LLMs generally perform better when each consideration is handled by a separate LLM call, allowing focused attention on each specific aspect.
+
+Examples where parallelization is useful:
+
+#### Sectioning:
+
+- Implementing guardrails where one model instance processes user queries while another screens them for inappropriate content or requests. This tends to perform better than having the same LLM call handle both guardrails and the core response.
+- Automating evals for evaluating LLM performance, where each LLM call evaluates a different aspect of the model's performance on a given prompt.
+
+#### Voting:
+
+- Reviewing a piece of code for vulnerabilities, where several different prompts review and flag the code if they find a problem.
+- Evaluating whether a given piece of content is inappropriate, with multiple prompts evaluating different aspects or requiring different vote thresholds to balance false positives and negatives.
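+
+The sketch below shows one possible shape for both variations, using a thread pool for the parallel calls; the review aspects and the YES/NO vulnerability vote are illustrative.
+
+```python
+from collections import Counter
+from concurrent.futures import ThreadPoolExecutor
+from anthropic import Anthropic
+
+client = Anthropic()
+
+def call_llm(prompt: str) -> str:
+    msg = client.messages.create(model="claude-3-5-sonnet-20241022", max_tokens=512,
+                                 messages=[{"role": "user", "content": prompt}])
+    return msg.content[0].text
+
+def sectioning(document: str, aspects: list[str]) -> str:
+    """Sectioning: review independent aspects in parallel, then aggregate programmatically."""
+    with ThreadPoolExecutor() as pool:
+        reviews = list(pool.map(
+            lambda aspect: call_llm(f"Review only the {aspect} of this text:\n\n{document}"),
+            aspects,
+        ))
+    return "\n\n".join(reviews)
+
+def vote_on_vulnerability(code: str, n_votes: int = 3) -> bool:
+    """Voting: ask the same question several times and take the majority answer."""
+    prompt = f"Does this code contain a security vulnerability? Answer YES or NO only.\n\n{code}"
+    with ThreadPoolExecutor() as pool:
+        answers = list(pool.map(lambda _: call_llm(prompt).strip().upper(), range(n_votes)))
+    return Counter(answers).most_common(1)[0][0].startswith("YES")
+```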
+
+#### Workflow: Orchestrator-workers
+
+In the orchestrator-workers workflow, a central LLM dynamically breaks down tasks, delegates them to worker LLMs, and synthesizes their results.
+
+
+
+The orchestrator-workers workflow
+
+When to use this workflow: This workflow is well-suited for complex tasks where you can't predict the subtasks needed (in coding, for example, the number of files that need to be changed and the nature of the change in each file likely depend on the task). Whereas it's topographically similar, the key difference from parallelization is its flexibility—subtasks aren't pre-defined, but determined by the orchestrator based on the specific input.
+
+Example where orchestrator-workers is useful:
+
+- Coding products that make complex changes to multiple files each time.
+- Search tasks that involve gathering and analyzing information from multiple sources for possible relevant information.
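+
+A simplified sketch of the orchestrator-workers pattern: one call plans subtasks as a JSON array, workers handle them one call each, and a final call synthesizes the results. The prompts and the JSON planning format are assumptions made for the example, and real code would validate the plan before trusting it.
+
+```python
+import json
+from anthropic import Anthropic
+
+client = Anthropic()
+
+def call_llm(prompt: str, system: str = "") -> str:
+    msg = client.messages.create(model="claude-3-5-sonnet-20241022", max_tokens=2048,
+                                 system=system, messages=[{"role": "user", "content": prompt}])
+    return msg.content[0].text
+
+def orchestrate(task: str) -> str:
+    # The orchestrator decides the subtasks at run time, based on the specific input.
+    plan = call_llm(
+        f"Break this task into 2-5 independent subtasks. Return a JSON array of strings only.\n\nTask: {task}",
+        system="You are a planning orchestrator.",
+    )
+    subtasks = json.loads(plan)  # a real implementation should validate and retry on malformed JSON
+
+    # Worker calls: one focused LLM call per subtask.
+    results = [call_llm(f"Complete this subtask and return only the result:\n\n{sub}") for sub in subtasks]
+
+    # Synthesis: combine the worker results into a single answer.
+    joined = "\n\n".join(f"Subtask: {s}\nResult: {r}" for s, r in zip(subtasks, results))
+    return call_llm(f"Synthesize these subtask results into one coherent answer for: {task}\n\n{joined}")
+```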
+
+#### Workflow: Evaluator-optimizer
+
+In the evaluator-optimizer workflow, one LLM call generates a response while another provides evaluation and feedback in a loop.
+
+
+
+#### The evaluator-optimizer workflow
+
+When to use this workflow: This workflow is particularly effective when we have clear evaluation criteria, and when iterative
+
+refinement provides measurable value. The two signs of good fit are, first, that LLM responses can be demonstrably improved when a human articulates their feedback; and second, that the LLM can provide such feedback. This is analogous to the iterative writing process a human writer might go through when producing a polished document.
+
+#### Examples where evaluator-optimizer is useful:
+
+- Literary translation where there are nuances that the translator LLM might not capture initially, but where an evaluator LLM can provide useful critiques.
+- Complex search tasks that require multiple rounds of searching and analysis to gather comprehensive information, where the evaluator decides whether further searches are warranted.
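+
+A minimal evaluator-optimizer loop based on the literary translation example above; the APPROVED sentinel and the fixed round limit are illustrative choices.
+
+```python
+from anthropic import Anthropic
+
+client = Anthropic()
+
+def call_llm(prompt: str, system: str = "") -> str:
+    msg = client.messages.create(model="claude-3-5-sonnet-20241022", max_tokens=2048,
+                                 system=system, messages=[{"role": "user", "content": prompt}])
+    return msg.content[0].text
+
+def translate_with_feedback(text: str, language: str, max_rounds: int = 3) -> str:
+    draft = call_llm(f"Translate into {language}:\n\n{text}", system="You are a literary translator.")
+    for _ in range(max_rounds):
+        critique = call_llm(
+            f"Critique this {language} translation of the source text. "
+            f"If it needs no changes, reply exactly APPROVED.\n\nSource:\n{text}\n\nTranslation:\n{draft}",
+            system="You are a strict translation evaluator.",
+        )
+        if critique.strip() == "APPROVED":
+            break
+        draft = call_llm(
+            f"Revise the translation to address this feedback.\n\nSource:\n{text}\n\n"
+            f"Current translation:\n{draft}\n\nFeedback:\n{critique}"
+        )
+    return draft
+```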
+
+#### Agents
+
+Agents are emerging in production as LLMs mature in key capabilities—understanding complex inputs, engaging in reasoning and planning, using tools reliably, and recovering from errors. Agents begin their work with either a command from, or interactive discussion with, the human user. Once the task is clear, agents plan and operate independently, potentially returning to the human for further information or judgement. During execution, it's crucial for the agents to gain "ground truth" from the environment at each step (such as tool call results or code execution) to assess its progress. Agents can then pause for human feedback at checkpoints or when encountering blockers. The task often terminates upon completion, but it's also common to include stopping conditions (such as a maximum number of iterations) to maintain control.
+
+Agents can handle sophisticated tasks, but their implementation is often straightforward. They are typically just LLMs using tools based on environmental feedback in a loop. It is therefore crucial to design toolsets and their documentation clearly and thoughtfully. We expand on best practices for tool development in Appendix 2 ("Prompt Engineering your Tools").
+
+
+
+When to use agents: Agents can be used for open-ended problems where it's difficult or impossible to predict the required number of steps, and where you can't hardcode a fixed path. The LLM will potentially operate for many turns, and you must have some level of trust in its decision-making. Agents' autonomy makes them ideal for scaling tasks in trusted environments.
+
+The autonomous nature of agents means higher costs, and the potential for compounding errors. We recommend extensive testing in sandboxed environments, along with the appropriate guardrails.
+
+Examples where agents are useful:
+
+- The following examples are from our own implementations:
+ - A coding Agent to resolve SWE-bench tasks, which involve edits to many files based on a task description;
+ - Our "computer use" reference implementation, where Claude uses a computer to accomplish tasks.
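+
+A bare-bones sketch of the agent loop described above, using the Messages API tool-use pattern: the model acts, the environment returns ground truth, and a turn limit provides the stopping condition. The `run_tests` tool is a stub invented for the example.
+
+```python
+from anthropic import Anthropic
+
+client = Anthropic()
+
+# One invented tool the agent can call; in practice this would expose your real environment.
+TOOLS = [{
+    "name": "run_tests",
+    "description": "Run the project's test suite and return failure output, or 'all tests passed'.",
+    "input_schema": {"type": "object", "properties": {}, "required": []},
+}]
+
+def run_tests() -> str:
+    return "all tests passed"  # stub for the real environment feedback
+
+def run_agent(task: str, max_turns: int = 10) -> str:
+    messages = [{"role": "user", "content": task}]
+    for _ in range(max_turns):  # stopping condition to maintain control
+        response = client.messages.create(
+            model="claude-3-5-sonnet-20241022", max_tokens=2048,
+            tools=TOOLS, messages=messages,
+        )
+        messages.append({"role": "assistant", "content": response.content})
+        if response.stop_reason != "tool_use":
+            # No more tool calls: the agent considers the task done.
+            return "".join(block.text for block in response.content if block.type == "text")
+        # Execute every requested tool and feed the ground-truth results back.
+        tool_results = [
+            {"type": "tool_result", "tool_use_id": block.id, "content": run_tests()}
+            for block in response.content if block.type == "tool_use"
+        ]
+        messages.append({"role": "user", "content": tool_results})
+    return "Stopped: hit the maximum number of turns."
+```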
+
+
+High-level flow of a coding agent
+
+Autonomous agent
+
+# Combining and customizing these patterns
+
+These building blocks aren't prescriptive. They're common patterns that developers can shape and combine to fit different use cases. The key to success, as with any LLM features, is measuring performance and iterating on implementations. To repeat: you should consider adding complexity *only* when it demonstrably improves outcomes.
+
+#### Summary
+
+Success in the LLM space isn't about building the most sophisticated system. It's about building the *right* system for your needs. Start with simple prompts, optimize them with comprehensive evaluation, and add multi-step agentic systems only when simpler solutions fall short.
+
+When implementing agents, we try to follow three core principles:
+
+1. Maintain simplicity in your agent's design.
+2. Prioritize transparency by explicitly showing the agent's planning steps.
+3. Carefully craft your agent-computer interface (ACI) through thorough tool documentation and testing.
+
+Frameworks can help you get started quickly, but don't hesitate to reduce abstraction layers and build with basic components as you move to production. By following these principles, you can create agents that are not only powerful but also reliable, maintainable, and trusted by their users.
+
+#### Acknowledgements
+
+Written by Erik Schluntz and Barry Zhang. This work draws upon our experiences building agents at Anthropic and the valuable insights shared by our customers, for which we're deeply grateful.
+
+#### Appendix 1: Agents in practice
+
+Our work with customers has revealed two particularly promising applications for AI agents that demonstrate the practical value of the patterns discussed above. Both applications illustrate how agents add the most value for tasks that require both conversation and action, have clear success criteria, enable feedback loops, and integrate meaningful human oversight.
+
+#### A. Customer support
+
+Customer support combines familiar chatbot interfaces with enhanced capabilities through tool integration. This is a natural fit for more open-ended agents because:
+
+- Support interactions naturally follow a conversation flow while requiring access to external information and actions;
+- Tools can be integrated to pull customer data, order history, and knowledge base articles;
+- Actions such as issuing refunds or updating tickets can be handled programmatically; and
+- Success can be clearly measured through user-defined resolutions.
+
+Several companies have demonstrated the viability of this approach through usage-based pricing models that charge only for successful resolutions, showing confidence in their agents' effectiveness.
+
+#### B. Coding agents
+
+The software development space has shown remarkable potential for LLM features, with capabilities evolving from code completion to autonomous problem-solving. Agents are particularly effective because:
+
+- Code solutions are verifiable through automated tests;
+- Agents can iterate on solutions using test results as feedback;
+- The problem space is well-defined and structured; and
+- Output quality can be measured objectively.
+
+In our own implementation, agents can now solve real GitHub issues in the SWE-bench Verified benchmark based on the pull request description alone. However, while automated testing helps verify functionality, human review remains crucial for ensuring solutions align with broader system requirements.
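+
+A minimal sketch of that feedback loop, assuming a pytest-style test suite and leaving the model call and patch application as stand-ins:
+
+```python
+import subprocess
+
+
+def propose_patch(task, test_output):
+    """Stand-in for asking the model for a patch, given the task and the latest failures."""
+    raise NotImplementedError
+
+
+def apply_patch(patch):
+    """Stand-in for applying the proposed edit to the working tree."""
+    raise NotImplementedError
+
+
+def solve_with_tests(task, max_attempts=5):
+    test_output = ""
+    for _ in range(max_attempts):
+        patch = propose_patch(task, test_output)
+        apply_patch(patch)
+
+        # Objective signal: run the test suite and capture failures verbatim.
+        run = subprocess.run(["pytest", "-q"], capture_output=True, text=True)
+        if run.returncode == 0:
+            return patch  # tests pass; hand off to human review
+        test_output = run.stdout + run.stderr  # feed the failures back to the model
+
+    raise RuntimeError("No passing patch within the attempt budget")
+```
+
+The human-review step mentioned above deliberately stays outside this loop.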
+
+#### Appendix 2: Prompt engineering your tools
+
+No matter which agentic system you're building, tools will likely be an important part of your agent. Tools enable Claude to interact with external services and APIs by specifying their exact structure and definition in our API. When Claude responds, it will include a tool use block in the API response if it plans to invoke a tool. Tool definitions and specifications should be given just as much prompt engineering attention as your overall prompts. In this brief appendix, we describe how to prompt engineer your tools.
+
+There are often several ways to specify the same action. For instance, you can specify a file edit by writing a diff, or by rewriting the entire file. For structured output, you can return code inside markdown or inside JSON. In software engineering, differences like these are cosmetic and can be converted losslessly from one to the other. However, some formats are much more difficult for an LLM to write than others. Writing a diff requires knowing how many lines are changing in the chunk header before the new code is written. Writing code inside JSON (compared to markdown) requires extra escaping of newlines and quotes.
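+
+A quick way to see the second kind of overhead is to print the same snippet both ways; the escaped quotes and newlines in the JSON version are exactly the extra bookkeeping the model has to get right:
+
+```python
+import json
+
+# The same two-line function, written directly (as it would appear inside a
+# markdown code block) versus as the string a model must emit inside JSON.
+code = 'def greet(name):\n    return "Hello, " + name\n'
+
+print(code)              # newlines and quotes appear as-is
+print(json.dumps(code))  # "def greet(name):\n    return \"Hello, \" + name\n"
+```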
+
+Our suggestions for deciding on tool formats are the following:
+
+- Give the model enough tokens to "think" before it writes itself into a corner.
+- Keep the format close to what the model has seen naturally occurring in text on the internet.
+- Make sure there's no formatting "overhead" such as having to keep an accurate count of thousands of lines of code, or string-escaping any code it writes.
+
+One rule of thumb is to think about how much effort goes into human-computer interfaces (HCI), and plan to invest just as much effort in creating good *agent*-computer interfaces (ACI). Here are some thoughts on how to do so:
+
+- Put yourself in the model's shoes. Is it obvious how to use this tool, based on the description and parameters, or would you need to think carefully about it? If so, then it's probably also true for the model. A good tool definition often includes example usage, edge cases, input format requirements, and clear boundaries from other tools.
+- How can you change parameter names or descriptions to make things more obvious? Think of this as writing a great docstring for a junior developer on your team. This is especially important when using many similar tools.
+- Test how the model uses your tools: Run many example inputs in our workbench to see what mistakes the model makes, and iterate.
+- Poka-yoke your tools. Change the arguments so that it is harder to make mistakes.
+
+While building our agent for SWE-bench, we actually spent more time optimizing our tools than the overall prompt. For example, we found that the model would make mistakes with tools using relative filepaths after the agent had moved out of the root directory. To fix this, we changed the tool to always require absolute filepaths—and we found that the model used this method flawlessly.
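+
+Putting these ideas together, here is a sketch of what such a tool could look like; the name, wording, and validation rule are hypothetical, but they show a docstring-quality description with an example plus an absolute-path requirement enforced both in the schema and in the tool itself:
+
+```python
+# Hypothetical file-viewing tool, "poka-yoke'd" so relative paths cannot slip through.
+view_file_tool = {
+    "name": "view_file",
+    "description": (
+        "Read a file and return its contents with line numbers. "
+        "Always pass an ABSOLUTE path (starting with '/'); relative paths are rejected "
+        "because the agent's working directory may change between steps. "
+        "Example: view_file(path='/repo/src/app.py', start_line=1, end_line=40)."
+    ),
+    "input_schema": {
+        "type": "object",
+        "properties": {
+            "path": {
+                "type": "string",
+                "description": "Absolute path to the file, e.g. '/repo/src/app.py'.",
+                "pattern": "^/",  # schema-level guard: must start with '/'
+            },
+            "start_line": {"type": "integer", "minimum": 1},
+            "end_line": {"type": "integer", "minimum": 1},
+        },
+        "required": ["path"],
+    },
+}
+
+
+def view_file(path, start_line=1, end_line=None):
+    # Belt-and-braces check on the tool side, so a bad call fails loudly with an
+    # actionable message instead of silently reading the wrong file.
+    if not path.startswith("/"):
+        raise ValueError(f"view_file requires an absolute path, got {path!r}")
+    with open(path, "r", encoding="utf-8") as f:
+        lines = f.readlines()[start_line - 1 : end_line]
+    return "".join(f"{start_line + i}: {line}" for i, line in enumerate(lines))
+```
+
+The schema `pattern` and the runtime check are redundant on purpose; either one alone would have caught the relative-path mistakes described above.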
+
+
diff --git a/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/source.pdf b/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/source.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..57434d59b293976c698c908316a453cb5bdf18e0
--- /dev/null
+++ b/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/source.pdf
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a0353b4c9a809722eba156d0f9678ed537d40c4c06caf6de420136970757e470
+size 1708153
diff --git a/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/table_9f02.png b/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/table_9f02.png
new file mode 100644
index 0000000000000000000000000000000000000000..1059dbdf43e638bf261b7c2cce9bedd7f99b8152
Binary files /dev/null and b/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/table_9f02.png differ
diff --git a/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/table_ef93.png b/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/table_ef93.png
new file mode 100644
index 0000000000000000000000000000000000000000..c22cdb0e1060d47c7c05a6d933009576109a20b3
Binary files /dev/null and b/pptagent/runs/pdf/37fd83b93256101767cb27322fba795f/table_ef93.png differ
diff --git a/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/_page_0_Figure_10.jpeg b/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/_page_0_Figure_10.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..c1f66605e8b152d2f25b2cb58feec13c8252cff3
Binary files /dev/null and b/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/_page_0_Figure_10.jpeg differ
diff --git a/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/_page_11_Figure_0.jpeg b/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/_page_11_Figure_0.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..dca4560ece93e2963ce63f809041d5cd2875614f
Binary files /dev/null and b/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/_page_11_Figure_0.jpeg differ
diff --git a/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/_page_2_Figure_0.jpeg b/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/_page_2_Figure_0.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..204af99cec932b8dbc5a66fb9d2c3d69504a568d
Binary files /dev/null and b/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/_page_2_Figure_0.jpeg differ
diff --git a/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/_page_4_Figure_0.jpeg b/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/_page_4_Figure_0.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..32e9a54e4384e36d5985e01f34c2390429c2331f
--- /dev/null
+++ b/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/_page_4_Figure_0.jpeg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b32012c8bcc8dd44500aa3c84be88660de94f20b1b3fca67a83d972b45ac6ae3
+size 164516
diff --git a/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/_page_5_Figure_2.jpeg b/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/_page_5_Figure_2.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..0c6446c3b4192308b0e7c3e3b93c08d4ae716262
Binary files /dev/null and b/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/_page_5_Figure_2.jpeg differ
diff --git a/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/meta.json b/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/meta.json
new file mode 100644
index 0000000000000000000000000000000000000000..22751f308cf0a41303071cf4b5e9d30d7f70d788
--- /dev/null
+++ b/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/meta.json
@@ -0,0 +1,1081 @@
+{
+ "table_of_contents": [
+ {
+ "title": "PresentAgent: Multimodal Agent for Presentation Video Generation",
+ "heading_level": null,
+ "page_id": 0,
+ "polygon": [
+ [
+ 73.7724609375,
+ 77.4471435546875
+ ],
+ [
+ 523.24609375,
+ 77.4471435546875
+ ],
+ [
+ 523.24609375,
+ 92.09521484375
+ ],
+ [
+ 73.7724609375,
+ 92.09521484375
+ ]
+ ]
+ },
+ {
+ "title": "Abstract",
+ "heading_level": null,
+ "page_id": 0,
+ "polygon": [
+ [
+ 157.75796508789062,
+ 221.8001708984375
+ ],
+ [
+ 204.875,
+ 221.8001708984375
+ ],
+ [
+ 204.875,
+ 233.75537109375
+ ],
+ [
+ 157.75796508789062,
+ 233.75537109375
+ ]
+ ]
+ },
+ {
+ "title": "1 Introduction",
+ "heading_level": null,
+ "page_id": 0,
+ "polygon": [
+ [
+ 70.753173828125,
+ 648.191162109375
+ ],
+ [
+ 154.8203125,
+ 648.191162109375
+ ],
+ [
+ 154.8203125,
+ 660.279296875
+ ],
+ [
+ 70.753173828125,
+ 660.279296875
+ ]
+ ]
+ },
+ {
+ "title": "2 Presentation Benchmark",
+ "heading_level": null,
+ "page_id": 1,
+ "polygon": [
+ [
+ 304.693359375,
+ 362.5001525878906
+ ],
+ [
+ 451.365234375,
+ 362.5001525878906
+ ],
+ [
+ 451.365234375,
+ 374.4553527832031
+ ],
+ [
+ 304.693359375,
+ 374.4553527832031
+ ]
+ ]
+ },
+ {
+ "title": "2.1 Doc2Present Dataset",
+ "heading_level": null,
+ "page_id": 2,
+ "polygon": [
+ [
+ 69.880126953125,
+ 327.6728515625
+ ],
+ [
+ 192.361328125,
+ 327.6728515625
+ ],
+ [
+ 192.361328125,
+ 339.4931945800781
+ ],
+ [
+ 69.880126953125,
+ 339.4931945800781
+ ]
+ ]
+ },
+ {
+ "title": "2.2 PresentEval",
+ "heading_level": null,
+ "page_id": 2,
+ "polygon": [
+ [
+ 70.389404296875,
+ 543.1064453125
+ ],
+ [
+ 152.6376953125,
+ 543.1064453125
+ ],
+ [
+ 152.6376953125,
+ 554.7662048339844
+ ],
+ [
+ 70.389404296875,
+ 554.7662048339844
+ ]
+ ]
+ },
+ {
+ "title": "3 PresentAgent",
+ "heading_level": null,
+ "page_id": 3,
+ "polygon": [
+ [
+ 70.607666015625,
+ 137.81512451171875
+ ],
+ [
+ 158.603515625,
+ 137.81512451171875
+ ],
+ [
+ 158.603515625,
+ 149.77032470703125
+ ],
+ [
+ 70.607666015625,
+ 149.77032470703125
+ ]
+ ]
+ },
+ {
+ "title": "3.1 Problem Formulation",
+ "heading_level": null,
+ "page_id": 3,
+ "polygon": [
+ [
+ 70.243896484375,
+ 345.55712890625
+ ],
+ [
+ 196.45156860351562,
+ 345.55712890625
+ ],
+ [
+ 196.45156860351562,
+ 356.65771484375
+ ],
+ [
+ 70.243896484375,
+ 356.65771484375
+ ]
+ ]
+ },
+ {
+ "title": "3.2 Slide Planning and Composition",
+ "heading_level": null,
+ "page_id": 3,
+ "polygon": [
+ [
+ 305.56640625,
+ 123.43804931640625
+ ],
+ [
+ 480.73126220703125,
+ 123.43804931640625
+ ],
+ [
+ 480.73126220703125,
+ 134.954345703125
+ ],
+ [
+ 305.56640625,
+ 134.954345703125
+ ]
+ ]
+ },
+ {
+ "title": "3.3 Narration and Audio Synthesis",
+ "heading_level": null,
+ "page_id": 3,
+ "polygon": [
+ [
+ 306.14202880859375,
+ 516.3828125
+ ],
+ [
+ 475.228515625,
+ 516.3828125
+ ],
+ [
+ 475.228515625,
+ 527.3302001953125
+ ],
+ [
+ 306.14202880859375,
+ 527.3302001953125
+ ]
+ ]
+ },
+ {
+ "title": "3.4 Video Assembly",
+ "heading_level": null,
+ "page_id": 4,
+ "polygon": [
+ [
+ 70.389404296875,
+ 454.712890625
+ ],
+ [
+ 169.85516357421875,
+ 454.712890625
+ ],
+ [
+ 169.85516357421875,
+ 466.224609375
+ ],
+ [
+ 70.389404296875,
+ 466.224609375
+ ]
+ ]
+ },
+ {
+ "title": "4 Experiments",
+ "heading_level": null,
+ "page_id": 4,
+ "polygon": [
+ [
+ 70.2802734375,
+ 673.435546875
+ ],
+ [
+ 154.529296875,
+ 673.435546875
+ ],
+ [
+ 154.529296875,
+ 685.9183502197266
+ ],
+ [
+ 70.2802734375,
+ 685.9183502197266
+ ]
+ ]
+ },
+ {
+ "title": "4.1 Main Results",
+ "heading_level": null,
+ "page_id": 4,
+ "polygon": [
+ [
+ 304.984375,
+ 524.1943359375
+ ],
+ [
+ 392.20391845703125,
+ 524.1943359375
+ ],
+ [
+ 392.20391845703125,
+ 535.7060546875
+ ],
+ [
+ 304.984375,
+ 535.7060546875
+ ]
+ ]
+ },
+ {
+ "title": "4.2 Analysis",
+ "heading_level": null,
+ "page_id": 5,
+ "polygon": [
+ [
+ 70.607666015625,
+ 661.1015625
+ ],
+ [
+ 135.4677734375,
+ 661.1015625
+ ],
+ [
+ 135.4677734375,
+ 673.0631561279297
+ ],
+ [
+ 70.607666015625,
+ 673.0631561279297
+ ]
+ ]
+ },
+ {
+ "title": "5 Conclusion",
+ "heading_level": null,
+ "page_id": 5,
+ "polygon": [
+ [
+ 303.529296875,
+ 534.0615234375
+ ],
+ [
+ 381.208740234375,
+ 534.0615234375
+ ],
+ [
+ 381.208740234375,
+ 546.3853454589844
+ ],
+ [
+ 303.529296875,
+ 546.3853454589844
+ ]
+ ]
+ },
+ {
+ "title": "References",
+ "heading_level": null,
+ "page_id": 6,
+ "polygon": [
+ [
+ 70.06201171875,
+ 72.56494140625
+ ],
+ [
+ 126.95556640625,
+ 72.56494140625
+ ],
+ [
+ 126.95556640625,
+ 84.71435546875
+ ],
+ [
+ 70.06201171875,
+ 84.71435546875
+ ]
+ ]
+ },
+ {
+ "title": "A Related Work",
+ "heading_level": null,
+ "page_id": 9,
+ "polygon": [
+ [
+ 70.243896484375,
+ 71.845458984375
+ ],
+ [
+ 162.61021423339844,
+ 71.845458984375
+ ],
+ [
+ 162.61021423339844,
+ 84.71435546875
+ ],
+ [
+ 70.243896484375,
+ 84.71435546875
+ ]
+ ]
+ },
+ {
+ "title": "A.1 Document-to-Multimodal Generation",
+ "heading_level": null,
+ "page_id": 9,
+ "polygon": [
+ [
+ 70.2802734375,
+ 93.73828125
+ ],
+ [
+ 272.6625061035156,
+ 93.73828125
+ ],
+ [
+ 272.6625061035156,
+ 106.1451416015625
+ ],
+ [
+ 70.2802734375,
+ 106.1451416015625
+ ]
+ ]
+ },
+ {
+ "title": "A.2 Vision-Language Agents",
+ "heading_level": null,
+ "page_id": 9,
+ "polygon": [
+ [
+ 70.098388671875,
+ 393.24853515625
+ ],
+ [
+ 213.4599609375,
+ 393.24853515625
+ ],
+ [
+ 213.4599609375,
+ 405.17138671875
+ ],
+ [
+ 70.098388671875,
+ 405.17138671875
+ ]
+ ]
+ },
+ {
+ "title": "B Implementation Details",
+ "heading_level": null,
+ "page_id": 9,
+ "polygon": [
+ [
+ 306.14202880859375,
+ 398.18212890625
+ ],
+ [
+ 447.58203125,
+ 398.18212890625
+ ],
+ [
+ 447.58203125,
+ 411.3153381347656
+ ],
+ [
+ 306.14202880859375,
+ 411.3153381347656
+ ]
+ ]
+ },
+ {
+ "title": "C Discussion",
+ "heading_level": null,
+ "page_id": 10,
+ "polygon": [
+ [
+ 70.35302734375,
+ 150.988525390625
+ ],
+ [
+ 146.59912109375,
+ 150.988525390625
+ ],
+ [
+ 146.59912109375,
+ 164.32232666015625
+ ],
+ [
+ 70.35302734375,
+ 164.32232666015625
+ ]
+ ]
+ },
+ {
+ "title": "D Limitations",
+ "heading_level": null,
+ "page_id": 10,
+ "polygon": [
+ [
+ 70.316650390625,
+ 631.5
+ ],
+ [
+ 151.4736328125,
+ 631.5
+ ],
+ [
+ 151.4736328125,
+ 645.5033721923828
+ ],
+ [
+ 70.316650390625,
+ 645.5033721923828
+ ]
+ ]
+ },
+ {
+ "title": "E Evaluation Benchmark",
+ "heading_level": null,
+ "page_id": 10,
+ "polygon": [
+ [
+ 305.56640625,
+ 149.960693359375
+ ],
+ [
+ 445.8359375,
+ 149.960693359375
+ ],
+ [
+ 445.8359375,
+ 163.23040771484375
+ ],
+ [
+ 305.56640625,
+ 163.23040771484375
+ ]
+ ]
+ },
+ {
+ "title": "F Doc2Present Dataset Details",
+ "heading_level": null,
+ "page_id": 10,
+ "polygon": [
+ [
+ 305.56640625,
+ 358.0966796875
+ ],
+ [
+ 470.86328125,
+ 358.0966796875
+ ],
+ [
+ 470.86328125,
+ 371.4473571777344
+ ],
+ [
+ 305.56640625,
+ 371.4473571777344
+ ]
+ ]
+ },
+ {
+ "title": "G PresentEval",
+ "heading_level": null,
+ "page_id": 10,
+ "polygon": [
+ [
+ 306.14202880859375,
+ 654.9345703125
+ ],
+ [
+ 390.833984375,
+ 654.9345703125
+ ],
+ [
+ 390.833984375,
+ 668.8763580322266
+ ],
+ [
+ 306.14202880859375,
+ 668.8763580322266
+ ]
+ ]
+ },
+ {
+ "title": "G.1 Prompts of Objective Quiz Evaluation",
+ "heading_level": null,
+ "page_id": 10,
+ "polygon": [
+ [
+ 304.693359375,
+ 676.724609375
+ ],
+ [
+ 513.642578125,
+ 676.724609375
+ ],
+ [
+ 513.642578125,
+ 689.9411468505859
+ ],
+ [
+ 304.693359375,
+ 689.9411468505859
+ ]
+ ]
+ },
+ {
+ "title": "G.2 Prompts of Subjective Scoring",
+ "heading_level": null,
+ "page_id": 11,
+ "polygon": [
+ [
+ 70.35302734375,
+ 677.1357421875
+ ],
+ [
+ 240.9609375,
+ 677.1357421875
+ ],
+ [
+ 240.9609375,
+ 689.4697265625
+ ],
+ [
+ 70.35302734375,
+ 689.4697265625
+ ]
+ ]
+ },
+ {
+ "title": "H Evaluation Setup",
+ "heading_level": null,
+ "page_id": 11,
+ "polygon": [
+ [
+ 304.984375,
+ 682.0693359375
+ ],
+ [
+ 416.734375,
+ 682.0693359375
+ ],
+ [
+ 416.734375,
+ 696.0478515625
+ ],
+ [
+ 304.984375,
+ 696.0478515625
+ ]
+ ]
+ }
+ ],
+ "page_stats": [
+ {
+ "page_id": 0,
+ "text_extraction_method": "pdftext",
+ "block_counts": [
+ [
+ "Span",
+ 258
+ ],
+ [
+ "Line",
+ 94
+ ],
+ [
+ "Text",
+ 10
+ ],
+ [
+ "SectionHeader",
+ 3
+ ],
+ [
+ "Figure",
+ 1
+ ],
+ [
+ "Caption",
+ 1
+ ],
+ [
+ "PageFooter",
+ 1
+ ],
+ [
+ "FigureGroup",
+ 1
+ ]
+ ]
+ },
+ {
+ "page_id": 1,
+ "text_extraction_method": "pdftext",
+ "block_counts": [
+ [
+ "Span",
+ 205
+ ],
+ [
+ "Line",
+ 101
+ ],
+ [
+ "Text",
+ 8
+ ],
+ [
+ "ListItem",
+ 4
+ ],
+ [
+ "SectionHeader",
+ 1
+ ],
+ [
+ "PageFooter",
+ 1
+ ],
+ [
+ "ListGroup",
+ 1
+ ]
+ ]
+ },
+ {
+ "page_id": 2,
+ "text_extraction_method": "pdftext",
+ "block_counts": [
+ [
+ "Span",
+ 205
+ ],
+ [
+ "Line",
+ 103
+ ],
+ [
+ "Text",
+ 6
+ ],
+ [
+ "SectionHeader",
+ 2
+ ],
+ [
+ "Figure",
+ 1
+ ],
+ [
+ "Caption",
+ 1
+ ],
+ [
+ "PageFooter",
+ 1
+ ],
+ [
+ "FigureGroup",
+ 1
+ ]
+ ]
+ },
+ {
+ "page_id": 3,
+ "text_extraction_method": "pdftext",
+ "block_counts": [
+ [
+ "Span",
+ 273
+ ],
+ [
+ "Line",
+ 97
+ ],
+ [
+ "Text",
+ 11
+ ],
+ [
+ "SectionHeader",
+ 4
+ ],
+ [
+ "Equation",
+ 2
+ ],
+ [
+ "TextInlineMath",
+ 1
+ ],
+ [
+ "PageFooter",
+ 1
+ ]
+ ]
+ },
+ {
+ "page_id": 4,
+ "text_extraction_method": "pdftext",
+ "block_counts": [
+ [
+ "Span",
+ 274
+ ],
+ [
+ "Line",
+ 121
+ ],
+ [
+ "Text",
+ 10
+ ],
+ [
+ "SectionHeader",
+ 3
+ ],
+ [
+ "Figure",
+ 1
+ ],
+ [
+ "Caption",
+ 1
+ ],
+ [
+ "PageFooter",
+ 1
+ ],
+ [
+ "FigureGroup",
+ 1
+ ]
+ ]
+ },
+ {
+ "page_id": 5,
+ "text_extraction_method": "pdftext",
+ "block_counts": [
+ [
+ "Span",
+ 211
+ ],
+ [
+ "Line",
+ 77
+ ],
+ [
+ "Text",
+ 5
+ ],
+ [
+ "SectionHeader",
+ 2
+ ],
+ [
+ "Table",
+ 1
+ ],
+ [
+ "Figure",
+ 1
+ ],
+ [
+ "Caption",
+ 1
+ ],
+ [
+ "PageFooter",
+ 1
+ ],
+ [
+ "FigureGroup",
+ 1
+ ]
+ ]
+ },
+ {
+ "page_id": 6,
+ "text_extraction_method": "pdftext",
+ "block_counts": [
+ [
+ "Span",
+ 257
+ ],
+ [
+ "Line",
+ 110
+ ],
+ [
+ "ListItem",
+ 22
+ ],
+ [
+ "ListGroup",
+ 2
+ ],
+ [
+ "SectionHeader",
+ 1
+ ],
+ [
+ "PageFooter",
+ 1
+ ]
+ ]
+ },
+ {
+ "page_id": 7,
+ "text_extraction_method": "pdftext",
+ "block_counts": [
+ [
+ "Span",
+ 261
+ ],
+ [
+ "Line",
+ 111
+ ],
+ [
+ "ListItem",
+ 22
+ ],
+ [
+ "ListGroup",
+ 2
+ ],
+ [
+ "PageFooter",
+ 1
+ ]
+ ]
+ },
+ {
+ "page_id": 8,
+ "text_extraction_method": "pdftext",
+ "block_counts": [
+ [
+ "Span",
+ 81
+ ],
+ [
+ "Line",
+ 32
+ ],
+ [
+ "ListItem",
+ 6
+ ],
+ [
+ "Text",
+ 1
+ ],
+ [
+ "PageFooter",
+ 1
+ ],
+ [
+ "ListGroup",
+ 1
+ ]
+ ]
+ },
+ {
+ "page_id": 9,
+ "text_extraction_method": "pdftext",
+ "block_counts": [
+ [
+ "Span",
+ 235
+ ],
+ [
+ "Line",
+ 101
+ ],
+ [
+ "Text",
+ 9
+ ],
+ [
+ "SectionHeader",
+ 4
+ ],
+ [
+ "PageFooter",
+ 1
+ ]
+ ]
+ },
+ {
+ "page_id": 10,
+ "text_extraction_method": "pdftext",
+ "block_counts": [
+ [
+ "Span",
+ 180
+ ],
+ [
+ "Line",
+ 97
+ ],
+ [
+ "Text",
+ 11
+ ],
+ [
+ "SectionHeader",
+ 6
+ ],
+ [
+ "PageFooter",
+ 1
+ ]
+ ]
+ },
+ {
+ "page_id": 11,
+ "text_extraction_method": "pdftext",
+ "block_counts": [
+ [
+ "Span",
+ 112
+ ],
+ [
+ "Line",
+ 60
+ ],
+ [
+ "Text",
+ 5
+ ],
+ [
+ "SectionHeader",
+ 2
+ ],
+ [
+ "Figure",
+ 1
+ ],
+ [
+ "Caption",
+ 1
+ ],
+ [
+ "Table",
+ 1
+ ],
+ [
+ "PageFooter",
+ 1
+ ],
+ [
+ "FigureGroup",
+ 1
+ ]
+ ]
+ },
+ {
+ "page_id": 12,
+ "text_extraction_method": "pdftext",
+ "block_counts": [
+ [
+ "Span",
+ 77
+ ],
+ [
+ "Line",
+ 36
+ ],
+ [
+ "Text",
+ 5
+ ],
+ [
+ "Table",
+ 1
+ ],
+ [
+ "PageFooter",
+ 1
+ ]
+ ]
+ }
+ ],
+ "debug_data_path": "debug_data/source"
+}
\ No newline at end of file
diff --git a/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/refined_doc.json b/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/refined_doc.json
new file mode 100644
index 0000000000000000000000000000000000000000..0678ac20d75ea6ed034be840eb09be2e775c2aeb
--- /dev/null
+++ b/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/refined_doc.json
@@ -0,0 +1,608 @@
+{
+ "image_dir": "/Users/shijingwei/Desktop/PresentAgent/presentagent/../pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e",
+ "sections": [
+ {
+ "title": "PresentAgent: Multimodal Agent for Presentation Video Generation",
+ "summary": "The evaluation setup consists of 30 long-form documents with human-created reference videos spanning diverse topics. Each document is processed through the authors' generation pipeline to create a two-minute presentation video. The evaluation framework, PresentEval, employs a split strategy: Qwen-VL-2.5-3B conducts objective assessment via multiple-choice comprehension questions on entire videos, while Qwen-Omni-7B performs subjective scoring on shorter segments. Evaluation dimensions include narrative coherence, visual/audio appeal, and comprehension difficulty, guided by specific prompts. This approach addresses the current limitation of multimodal models in processing longer videos while maintaining comprehensive assessment across content quality, visual quality, and comprehension accuracy.",
+ "subsections": [
+ {
+ "title": "Authors and Affiliations",
+ "content": "Jingwei Shi1∗ Zeyu Zhang1∗† Biao Wu2∗ Yanjie Liang1∗\n\nMeng Fang3 Ling Chen2 Yang Zhao4‡\n\n1AI Geeks, Australia\n\n2Australian Artificial Intelligence Institute, Australia 3University of Liverpool, United Kingdom 4La Trobe University, Australia\n\n∗Equal contribution. † Project lead. ‡Corresponding author: y.zhao2@latrobe.edu.au.",
+ "medias": []
+ },
+ {
+ "title": "Abstract",
+ "content": "We present PresentAgent, a multimodal agent that transforms long-form documents into narrated presentation videos. While existing approaches are limited to generating static slides or text summaries, our method advances beyond these limitations by producing fully synchronized visual and spoken content that closely mimics human-style presentations. To achieve this integration, PresentAgent employs a modular pipeline that systematically segments the input document, plans and renders slide-style visual frames, generates contextual spoken narration with large language models and Text-to-Speech models, and seamlessly composes the final video with precise audiovisual alignment. Given the complexity of evaluating such multimodal outputs, we introduce PresentEval, a unified assessment framework powered by Vision-Language Models that comprehensively scores videos across three critical dimensions: content fidelity, visual clarity, and audience comprehension through prompt-based evaluation. Our experimental validation on a curated dataset of 30 document–presentation pairs demonstrates that PresentAgent approaches human-level quality across all evaluation metrics. These results highlight the significant potential of controllable multimodal agents in transforming static textual materials into dynamic, effective, and accessible presentation formats. Code will be available at https://github.com/ AIGeeksGroup/PresentAgent.",
+ "medias": []
+ }
+ ],
+ "markdown_content": null
+ },
+ {
+ "title": "Introduction",
+ "summary": "The evaluation setup consists of 30 long-form documents with human-created reference videos spanning diverse topics. Each document is processed through the authors' generation pipeline to create a two-minute presentation video. The evaluation framework, PresentEval, employs a split strategy: Qwen-VL-2.5-3B conducts objective assessment via multiple-choice comprehension questions on entire videos, while Qwen-Omni-7B performs subjective scoring on shorter segments. Evaluation dimensions include narrative coherence, visual/audio appeal, and comprehension difficulty, guided by specific prompts. This approach addresses the current limitation of multimodal models in processing longer videos while maintaining comprehensive assessment across content quality, visual quality, and comprehension accuracy.",
+ "subsections": [
+ {
+ "title": "Presentation importance",
+ "content": "Presentations are a widely used and effective medium for conveying complex ideas. By combining visual elements, structured narration, and spoken explanations, they enable information to unfold progressively and be more easily understood by diverse audiences (Fu et al., 2022). Despite their proven effectiveness, creating high-quality presentation videos from long-form documents—such as business reports, technical manuals, policy briefs, or academic papers—typically requires considerable manual effort (Li et al., 2023). This process involves identifying key content, designing slide layouts, writing scripts, recording narration, and aligning all elements into a coherent multimodal output.",
+ "medias": [
+ {
+ "markdown_content": "",
+ "near_chunks": [
+ "Presentations are a widely used and effective medium for conveying complex ideas. By combining visual elements, structured narration, and spoken explanations, they enable information to unfold progressively and be more easily understood by diverse audiences (Fu et al., 2022). Despite their proven effectiveness, creating high-quality presentation videos from long-form documents—such as\n\n",
+ "Figure 1: Overview of PresentAgent. It takes documents (e.g., web pages) as input and follows a generation pipeline: (1) document processing, (2) structured slide generation, (3) synchronized caption creation, and (4) audio synthesis. The final output is a presentation video combining visual slides with aligned narration. The purple-highlighted middle results emphasize the system's key transitional outputs during generation.\n\n"
+ ],
+ "path": "/Users/shijingwei/Desktop/PresentAgent/presentagent/../pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/_page_0_Figure_10.jpeg",
+ "caption": "Diagram: Overview of PresentAgent system showing a four-step pipeline that transforms documents into presentation videos, with input documents on the left, middle processing results (slides, captions, audio) in purple, and final presentation video output on the right."
+ }
+ ]
+ },
+ {
+ "title": "Current AI limitations",
+ "content": "Although recent advancements in AI have enabled progress in related areas such as documentto-slide generation (Fu et al., 2022; Zheng et al., 2025a; Pang et al., 2025; Zhang et al., 2024) and text-to-video synthesis (Yang et al., 2024c; Li et al., 2023; Xue et al., 2025; Khachatryan et al., 2023; He et al., 2023; Solanki and Khublani, 2024), a critical gap remains: these methods either produce static visual summaries or generic video clips without structured narration, limiting their effectiveness for structured communication tasks like presentations.",
+ "medias": []
+ },
+ {
+ "title": "Document-to-Presentation Video Generation",
+ "content": "To bridge this gap, we introduce the task of Document-to-Presentation Video Generation, which aims to automatically convert a structured or unstructured document into a narrated video presentation composed of synchronized slides and speech. This task presents unique challenges as it goes beyond traditional summarization (Lewis et al., 2019; Beltagy et al., 2020; Chen and Yang, 2021; Wang et al., 2024a) or text-to-speech (Tachibana et al., 2018; Ren et al., 2019; Popov et al., 2021; Ni et al., 2022) pipelines by requiring selective content abstraction, layout-aware planning (Wang et al., 2025), and precise multimodal alignment (Li et al., 2024) between visuals and narration. In contrast to prior work that focuses on either static slide and image generation (Zheng et al., 2025a; Deng et al., 2025; Xie et al., 2024) or audio summarization in isolation, our objective is to produce a fully integrated, viewer-ready video experience that closely mimics how human presenters deliver information in real-world scenarios.",
+ "medias": []
+ },
+ {
+ "title": "PresentAgent framework",
+ "content": "To tackle these challenges, we propose a modular generation framework named PresentAgent. Given an input document, the system first segments it into semantic blocks through outline planning, then generates layout-guided slide visuals for each block and rewrites the key message into oral-style narration. Subsequently, these are then synthesized into audio and combined with the slide visuals to produce a time-aligned presentation video. Importantly, our pipeline is designed to be domainadaptable and controllable, enabling broad applicability across document types and presentation styles.",
+ "medias": []
+ },
+ {
+ "title": "Evaluation approach",
+ "content": "Recognizing the need for rigorous evaluation of such complex multimodal outputs, we curate a test set of 30 human-authored document-video pairs spanning diverse domains, including education, finance, policy, and scientific communication. To comprehensively assess system performance, we further introduce a two-path evaluation strategy that combines fact-based comprehension assessment (via fixed multiple-choice quizzes) and preference-based scoring using vision-language models. This dual-pronged approach captures both objective correctness and subjective quality in video delivery.",
+ "medias": []
+ },
+ {
+ "title": "Results and findings",
+ "content": "Experiment results demonstrate that our method produces fluent, well-structured, and informative presentation videos, approaching human-level performance in both content delivery and viewer comprehension. These findings highlight the potential of combining language models, layout generation, and multimodal synthesis for creating explainable and scalable presentation systems from raw documents.",
+ "medias": []
+ },
+ {
+ "title": "Key contributions",
+ "content": "In general, our contributions are summarized as follows:\n\n- We formulate and address the novel task of document-to-presentation video generation, which aims to produce narrated, slide-structured videos from long-form documents across diverse domains.\n\n- We propose PresentAgent, a modular generation framework that integrates document parsing, layout-aware slide composition, narration planning, and audio-visual synchronization, enabling controllable and interpretable generation.\n- We introduce PresentEval, a multi-dimensional evaluation framework powered by Vision-Language Models (VLMs), which scores videos along content, visual, and comprehension dimensions via prompt-based judging.\n- We create a test set of 30 real-world document–presentation pairs and demonstrate through experiments and ablations that PresentAgent approaches human-level performance and significantly outperforms competitive variants.",
+ "medias": []
+ }
+ ],
+ "markdown_content": null
+ },
+ {
+ "title": "Presentation Benchmark",
+ "summary": "The evaluation setup consists of 30 long-form documents with human-created reference videos spanning diverse topics. Each document is processed through the authors' generation pipeline to create a two-minute presentation video. The evaluation framework, PresentEval, employs a split strategy: Qwen-VL-2.5-3B conducts objective assessment via multiple-choice comprehension questions on entire videos, while Qwen-Omni-7B performs subjective scoring on shorter segments. Evaluation dimensions include narrative coherence, visual/audio appeal, and comprehension difficulty, guided by specific prompts. This approach addresses the current limitation of multimodal models in processing longer videos while maintaining comprehensive assessment across content quality, visual quality, and comprehension accuracy.",
+ "subsections": [
+ {
+ "title": "Benchmark Overview",
+ "content": "The benchmark supports evaluation not only of fluency and fidelity, but also of downstream comprehension. Following the methodology introduced in Paper2Poster (Pang et al., 2025), we construct a quiz-style evaluation protocol (§5), where vision-language models are asked to answer factual content questions using only the generated video (slides + narration), simulating an audience's understanding. Human-authored videos are used as reference standards for both score calibration and upperbound comparison. As shown in Figure 5, our benchmark encompasses four representative document types (academic papers, web pages, technical blogs, and slides) paired with human-authored videos, covering diverse real-world domains like education, research, and business reports.\n\nWe adopt a unified, model-based evaluation framework to assess the generated presentation videos. All evaluations are conducted using a vision-language model, guided by dimensionspecific prompts tailored to different assessment objectives. The framework consists of two complementary components: (1) objective quiz evaluation, which measures factual accuracy through multiplechoice question answering; and (2) subjective scoring, which rates Content Quality, Visual or Audio Quality, and Comprehension Clarity on a 1–5 scale. Together, these metrics provide a comprehensive assessment of both the quality and informativeness of the generated videos.",
+ "medias": [
+ {
+ "markdown_content": "",
+ "near_chunks": [
+ "We adopt a unified, model-based evaluation framework to assess the generated presentation videos. All evaluations are conducted using a vision-language model, guided by dimensionspecific prompts tailored to different assessment objectives. The framework consists of two complementary components: (1) objective quiz evaluation, which measures factual accuracy through multiplechoice question answering; and (2) subjective scoring, which rates Content Quality, Visual or Audio Quality, and Comprehension Clarity on a 1–5 scale. Together, these metrics provide a comprehensive assessment of both the quality and informativeness\n\n",
+ "Figure 2: Overview of our framework. Our approach addresses the full pipeline of document-to-presentation video generation and evaluation. Left: Given diverse input documents—including papers, websites, blogs, slides, and PDFs—PresentAgent generates narrated presentation videos by producing synchronized slide decks with audio. Right: To evaluate these videos, we introduce PresentEval, a two-part evaluation framework: (1) Objective Quiz Evaluation (top), which measures factual comprehension using Qwen-VL; and (2) Subjective Scoring (bottom), which uses vision-language models to rate content quality, visual design, and audio comprehension across predefined dimensions.\n\n"
+ ],
+ "path": "/Users/shijingwei/Desktop/PresentAgent/presentagent/../pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/_page_2_Figure_0.jpeg",
+ "caption": "Diagram: Framework overview showing PresentAgent's document-to-video pipeline. Left shows input documents (papers, websites, blogs), center displays the video creation process, and right illustrates PresentEval's two-part evaluation system with objective quiz evaluation and subjective scoring metrics."
+ }
+ ]
+ },
+ {
+ "title": "Doc2Present Dataset",
+ "content": "To support the evaluation of document to presentation video generation, we curate the Doc2Present Benchmark, a diverse dataset of document–presentation video pairs spanning multiple domains. Unlike prior benchmarks focused on research abstracts or slide generation, our dataset includes documents such as business reports, product manuals, policy briefs, and instructional texts, each paired with a human-crafted presentation video.We collect 30 high-quality video samples from public platforms, educational repositories, and professional presentation archives, further details regarding the data sources and statistical information of the dataset can be found in the appendix F.",
+ "medias": []
+ },
+ {
+ "title": "PresentEval",
+ "content": "To assess the quality of generated presentation videos, we adopt two complementary evaluation strategies: Objective Quiz Evaluation and Subjective Scoring. For each video, we provide the visionlanguage model with the complete set of slide images and the full narration transcript as a unified input—simulating how a real viewer would experience the presentation. In Objective Quiz Evaluation, the model answers a fixed set of factual questions to determine whether the video accurately conveys the key information from the source content. In Subjective Scoring, the model evaluates the video along three dimensions: the coherence of the narration, the clarity and design of the visuals, and the overall ease of understanding. All evaluations are conducted without ground-truth references and rely entirely on the model's interpretation of the presented content.\n\nObjective Quiz Evaluation To evaluate whether a generated presentation video effectively conveys the core content of its source document, we use a fixed-question comprehension evaluation protocol. Specifically, we manually design five multiplechoice questions for each document, tailored to its content. These questions focus on key aspects such as topic recognition, structural understanding, and main argument extraction. As shown in Table 2, during evaluation, a vision-language model is given the video, including both visual frames and audio transcript, and asked to answer the five questions. Each question has four options, with one correct answer, annotated based on a human-created reference video. The final comprehension score (ranging from 0 to 5) reflects how many questions the model answered correctly, serving as a direct measure of how well the video communicates the original document.\n\nSubjective Scoring To evaluate the quality of generated presentation videos, we adopt a promptbased assessment using vision-language models. Instead of relying on human references or fixed metrics, we ask the model to evaluate each video from a viewer's perspective, using its own reasoning and preferences. The evaluation focuses on three aspects: coherence of narration, clarity and aesthetics of visuals, and overall ease of understanding. The model is shown the video and audio, and gives a score (1–5) with a brief explanation for each aspect. This enables scalable, consistent, and human-aligned evaluation without manual references. As shown in Table 3, we design different prompts for different modalities and tasks to ensure targeted and effective assessment.",
+ "medias": []
+ }
+ ],
+ "markdown_content": null
+ },
+ {
+ "title": "PresentAgent",
+ "summary": "The evaluation setup consists of 30 long-form documents with human-created reference videos spanning diverse topics. Each document is processed through the authors' generation pipeline to create a two-minute presentation video. The evaluation framework, PresentEval, employs a split strategy: Qwen-VL-2.5-3B conducts objective assessment via multiple-choice comprehension questions on entire videos, while Qwen-Omni-7B performs subjective scoring on shorter segments. Evaluation dimensions include narrative coherence, visual/audio appeal, and comprehension difficulty, guided by specific prompts. This approach addresses the current limitation of multimodal models in processing longer videos while maintaining comprehensive assessment across content quality, visual quality, and comprehension accuracy.",
+ "subsections": [
+ {
+ "title": "Overview",
+ "content": "To convert a long-form document into a narrated presentation video, we design a multi-stage generation framework that mirrors how human presenters prepare slides and talk tracks. Our method proceeds in four steps: segmenting the document into semantic units, composing slides with layout-aware structures, generating oral-style narration for each slide and assembling the visual and audio components into a synchronized video. This modular design supports controllability, interpretability, and multimodal alignment, enabling both high-quality generation and fine-grained evaluation. The following sections describe each component in detail.",
+ "medias": [
+ {
+ "markdown_content": "",
+ "near_chunks": [
+ "For each content block corresponding to a slide, we prompt a language model to generate a concise, oral-style narration. The model is instructed to rewrite the key message of the slide into natural spoken language, avoiding dense text or technical jargon. We apply length control to ensure each narration falls within a target duration, typically between 30 and 150 seconds. Once the narration script is obtained, we synthesize the corresponding audio using a text-to-speech system. Each narration audio is paired with its slide and timestamped, forming the basis for synchronized video rendering in the next stage.\n\n",
+ "Figure 3: Overview of the PresentAgent framework. Our system takes diverse documents (e.g., papers, websites, PDFs) as input and follows a modular generation pipeline. It first performs outline generation (Step 1) and retrieves the most suitable template (Step 2), then generates slides and narration notes via a vision-language model (Step 3). The notes are converted into audio via TTS and composed into a presentation video (Step 4). To evaluate video quality, we design multiple prompts (Step 5) and feed them into a VLM-based scoring pipeline (Step 6) that outputs dimension-specific metrics.\n\n"
+ ],
+ "path": "/Users/shijingwei/Desktop/PresentAgent/presentagent/../pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/_page_4_Figure_0.jpeg",
+ "caption": "Diagram: Comprehensive flowchart of the PresentAgent framework showing a six-step process from input documents through generation pipeline (outline creation, template selection, slides/notes generation, video production) to evaluation metrics with visual icons representing each component."
+ }
+ ]
+ },
+ {
+ "title": "Problem Formulation",
+ "content": "Our method is designed to transform a long-form document into a structured presentation video through a multi-stage generation pipeline. We provide a formal description to highlight the key difference between our approach and conventional slide-based methods.\n\nConventional approaches often focus on generating slide elements S directly from a document chunk C, as in Equation 1, where each element includes text or image content, layout attributes, and visual style:\n\n$$S=\\{e_{1},e_{2},...,e_{n}\\}=f(C)\\qquad\\quad(1)$$\n\nIn contrast, we treat the entire document D as a globally structured input and generate a presentation in three steps: (1) a sequence of semantic segments {C1, ..., CK} via outline planning, (2) a set of slides {S1, ..., SK}, each paired with a narrated audio track Tk generated by first producing a slide-specific script and then converting it to speech, and (3) a video V composed of visual and audio content aligned over time. This is defined as:\n\n$V=$ **Compose($\\{(S_{1},T_{1}),...,(S_{K},T_{K})\\})=g(D)$**\n\nRather than editing predefined templates or layouts, our system first identifies high-level structure in the document and then generates slide visuals and narration from scratch. This pipeline\n\nsupports controllability, modular evaluation, and multimodal alignment for downstream comprehension and quality assessment.",
+ "medias": []
+ },
+ {
+ "title": "Slide Planning and Composition",
+ "content": "Our slide generation module is inspired by the editing-based paradigm proposed in PPTAgent (Zheng et al., 2025b), which formulates presentation construction as a structured editing process over HTML-like layouts. While PPTAgent focuses on producing editable .pptx slides, our goal is to generate visually coherent, narrationready slide frames for downstream video synthesis. We re-implement the core idea in a self-contained pipeline tailored to multimodal synchronization.\n\nWe begin by segmenting the input document into coherent content blocks using a lightweight LLM-based parser. Each block is assigned a corresponding slide type such as bullet slide, figuredescription, or title-intro, and matched with a predefined layout schema encoded in HTML. Unlike retrieval-based template matching, our system uses semantic and structural cues to map content to layout patterns in a rule-guided manner.\n\nTo populate the slide, we define a set of editable operations such as replace_text, insert_image, and add_list, which are applied to the layout structure. These instructions are generated by prompting a language model with the content block and layout constraints. Slides are then rendered into static visual frames using python-pptx or HTML-based renderers.",
+ "medias": []
+ },
+ {
+ "title": "Narration and Audio Synthesis",
+ "content": "To transform the static slides into an engaging presentation, we generate a spoken narration for each slide and synthesize it into audio. The process involves two components: narration script generation and text-to-speech synthesis.\n\nFor each content block corresponding to a slide, we prompt a language model to generate a concise, oral-style narration. The model is instructed to rewrite the key message of the slide into natural spoken language, avoiding dense text or technical jargon. We apply length control to ensure each narration falls within a target duration, typically between 30 and 150 seconds. Once the narration script is obtained, we synthesize the corresponding audio using a text-to-speech system. Each narration audio is paired with its slide and timestamped, forming the basis for synchronized video rendering in the next stage.\n\n\n\nFigure 3: Overview of the PresentAgent framework. Our system takes diverse documents (e.g., papers, websites, PDFs) as input and follows a modular generation pipeline. It first performs outline generation (Step 1) and retrieves the most suitable template (Step 2), then generates slides and narration notes via a vision-language model (Step 3). The notes are converted into audio via TTS and composed into a presentation video (Step 4). To evaluate video quality, we design multiple prompts (Step 5) and feed them into a VLM-based scoring pipeline (Step 6) that outputs dimension-specific metrics.",
+ "medias": []
+ },
+ {
+ "title": "Video Assembly",
+ "content": "In the final stage, we assemble the slide images and narration audio into a coherent, time-aligned presentation video. Each slide frame is displayed for the duration of its corresponding audio segment, with optional transitions between segments. We use video processing libraries such as ffmpeg to compose the visual and audio tracks. Each slide is rendered as a static frame, and the narration is added as synchronized voiceover audio. The output is a fully rendered video file in standard formats such as .mp4, suitable for presentation, sharing, or further editing. This stage completes the transformation from a raw document into a narrated, structured presentation video.",
+ "medias": []
+ }
+ ],
+ "markdown_content": null
+ },
+ {
+ "title": "Experiments",
+ "summary": "The evaluation setup consists of 30 long-form documents with human-created reference videos spanning diverse topics. Each document is processed through the authors' generation pipeline to create a two-minute presentation video. The evaluation framework, PresentEval, employs a split strategy: Qwen-VL-2.5-3B conducts objective assessment via multiple-choice comprehension questions on entire videos, while Qwen-Omni-7B performs subjective scoring on shorter segments. Evaluation dimensions include narrative coherence, visual/audio appeal, and comprehension difficulty, guided by specific prompts. This approach addresses the current limitation of multimodal models in processing longer videos while maintaining comprehensive assessment across content quality, visual quality, and comprehension accuracy.",
+ "subsections": [
+ {
+ "title": "Introduction to Experiments",
+ "content": "We conduct experiments to evaluate the effectiveness of our proposed system in generating highquality, narrated presentation videos. Given the novelty of the task, our focus is not on competing with existing baselines, but rather on assessing the performance of our full system relative to human-created presentations. Comprehension accuracy is determined based on performance in the PresentEval task. Evaluation setup can be found in appendix H.\n\n**Via Fixed Quiz**\n\n...\n\n**Question1 : XXX Question2 : XXX**\n\n**Question5 : XXX**\n\n**1. Content Quality 2. Visual Quality 3. Comprehension Accuracy**",
+ "medias": [
+ {
+ "markdown_content": "| Method | Model | Quiz Accuracy | | Video Score | | | | Audio Score | | |\n| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |\n| | | | Content | Visual | Comp. | Mean | Content | Audio | Comp. | Mean |\n| Human | Human | 0.56 | 4.0 | 4.6 | 4.8 | 4.47 | 4.8 | 4.6 | 5.0 | 4.80 |\n| PresentAgent | Claude-3.7-sonnet | 0.64 | 4.0 | 4.0 | 4.0 | 4.00 | 4.2 | 4.6 | 4.8 | 4.53 |\n| PresentAgent | Qwen-VL-Max | 0.52 | 4.2 | 4.8 | 4.4 | 4.47 | 4.6 | 4.2 | 5.0 | 4.60 |\n| PresentAgent | Gemini-2.5-pro | 0.52 | 4.2 | 4.4 | 4.4 | 4.33 | 4.2 | 4.0 | 4.8 | 4.33 |\n| PresentAgent | Gemini-2.5-flash | 0.52 | 4.2 | 5.0 | 3.8 | 4.33 | 4.2 | 4.2 | 4.8 | 4.40 |\n| PresentAgent | GPT-4o-Mini | 0.64 | 4.8 | 4.6 | 4.6 | 4.67 | 4.0 | 4.4 | 4.8 | 4.40 |\n| PresentAgent | GPT-4o | 0.56 | 4.0 | 4.2 | 3.6 | 3.93 | 4.2 | 4.4 | 4.8 | 4.47 |",
+ "near_chunks": [
+ "In terms of subjective quality, human-created presentations still lead with the highest video and audio scores overall. However, several PresentAgent variants show competitive performance.\n\nTable 1 presents evaluation results, covering both factual comprehension (Quiz Accuracy) and preference-based quality scores for video and audio outputs. In terms of quiz accuracy, most PresentAgent variants perform comparably to or better than the human reference (0.56), with Claude-3.7 sonnet (Anthropic, 2024) achieving the highest accuracy at 0.64, suggesting strong alignment between the generated content and the source document. Other models such as Qwen-VL-Max (Bai et al., 2025) and Gemini-2.5-flash (DeepMind, 2024) scored slightly lower (0.52), indicating room for improvement in factual grounding.\n\n",
+ "Table 1: Detailed evaluation results on the 5-document test set. Fact-based evaluation includes accuracy on five fixed quiz questions (Q1–Q5). Preference-based evaluation includes 1–5 scale scores for content fidelity, visual design, and overall clarity. Each Quality Score group has a calculated mean column.\n\n"
+ ],
+ "path": "/Users/shijingwei/Desktop/PresentAgent/presentagent/../pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/table_f5f7.png",
+ "caption": "Table: Comparison of presentation quality between human presenters and PresentAgent variants using different LLMs, showing quiz accuracy and subjective scores for video (content, visual, comprehensibility) and audio (content, audio quality, comprehensibility) components.",
+ "cells": [
+ [
+ "Method",
+ "Model",
+ "Quiz Accuracy",
+ "",
+ "Video Score",
+ "",
+ "",
+ "",
+ "Audio Score",
+ "",
+ ""
+ ],
+ [
+ "",
+ "",
+ "",
+ "Content",
+ "Visual",
+ "Comp.",
+ "Mean",
+ "Content",
+ "Audio",
+ "Comp.",
+ "Mean"
+ ],
+ [
+ "Human",
+ "Human",
+ "0.56",
+ "4.0",
+ "4.6",
+ "4.8",
+ "4.47",
+ "4.8",
+ "4.6",
+ "5.0",
+ "4.80"
+ ],
+ [
+ "PresentAgent",
+ "Claude-3.7-sonnet",
+ "0.64",
+ "4.0",
+ "4.0",
+ "4.0",
+ "4.00",
+ "4.2",
+ "4.6",
+ "4.8",
+ "4.53"
+ ],
+ [
+ "PresentAgent",
+ "Qwen-VL-Max",
+ "0.52",
+ "4.2",
+ "4.8",
+ "4.4",
+ "4.47",
+ "4.6",
+ "4.2",
+ "5.0",
+ "4.60"
+ ],
+ [
+ "PresentAgent",
+ "Gemini-2.5-pro",
+ "0.52",
+ "4.2",
+ "4.4",
+ "4.4",
+ "4.33",
+ "4.2",
+ "4.0",
+ "4.8",
+ "4.33"
+ ],
+ [
+ "PresentAgent",
+ "Gemini-2.5-flash",
+ "0.52",
+ "4.2",
+ "5.0",
+ "3.8",
+ "4.33",
+ "4.2",
+ "4.2",
+ "4.8",
+ "4.40"
+ ],
+ [
+ "PresentAgent",
+ "GPT-4o-Mini",
+ "0.64",
+ "4.8",
+ "4.6",
+ "4.6",
+ "4.67",
+ "4.0",
+ "4.4",
+ "4.8",
+ "4.40"
+ ],
+ [
+ "PresentAgent",
+ "GPT-4o",
+ "0.56",
+ "4.0",
+ "4.2",
+ "3.6",
+ "3.93",
+ "4.2",
+ "4.4",
+ "4.8",
+ "4.47"
+ ]
+ ],
+ "merge_area": null
+ },
+ {
+ "markdown_content": "",
+ "near_chunks": [
+ "Table 1: Detailed evaluation results on the 5-document test set. Fact-based evaluation includes accuracy on five fixed quiz questions (Q1–Q5). Preference-based evaluation includes 1–5 scale scores for content fidelity, visual design, and overall clarity. Each Quality Score group has a calculated mean column.\n\n",
+ "Figure 4: PresentAgent Demo. Automatically generates academic-style slides and narrated videos from research papers, streamlining the transformation from written content to engaging visual presentations.\n\nFor example, GPT-4o-Mini (Achiam et al., 2023) achieves top scores in video content and visual appeal (both at or near 4.8), while Claude-3.7 sonnet (Anthropic, 2024) delivers the most balanced audio quality (mean 4.53). Interestingly, Gemini-2.5-flash (DeepMind, 2024) scores highest in visual quality (5.0) but lower in comprehension, reflecting a trade-off between aesthetics and clarity. These results highlight the effectiveness of our modular pipeline and the usefulness of our unified PresentEval framework in capturing diverse aspects of presentation quality.\n\n"
+ ],
+ "path": "/Users/shijingwei/Desktop/PresentAgent/presentagent/../pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/_page_5_Figure_2.jpeg",
+ "caption": "Figure: PresentAgent demo showing three sections: Technical Blogs (explaining agent concepts), Slides with captions (demonstrating parallelization and evaluator-optimizer workflows), and Videos (featuring augmented LLM presentations with explanatory text)."
+ }
+ ]
+ },
+ {
+ "title": "Main Results",
+ "content": "Table 1 presents evaluation results, covering both factual comprehension (Quiz Accuracy) and preference-based quality scores for video and audio outputs. In terms of quiz accuracy, most PresentAgent variants perform comparably to or better than the human reference (0.56), with Claude-3.7 sonnet (Anthropic, 2024) achieving the highest accuracy at 0.64, suggesting strong alignment between the generated content and the source document. Other models such as Qwen-VL-Max (Bai et al., 2025) and Gemini-2.5-flash (DeepMind, 2024) scored slightly lower (0.52), indicating room for improvement in factual grounding.\n\nIn terms of subjective quality, human-created presentations still lead with the highest video and audio scores overall. However, several PresentAgent variants show competitive performance.\n\n| Method | Model | Quiz Accuracy | | Video Score | | | | Audio Score | | |\n| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |\n| | | | Content | Visual | Comp. | Mean | Content | Audio | Comp. | Mean |\n| Human | Human | 0.56 | 4.0 | 4.6 | 4.8 | 4.47 | 4.8 | 4.6 | 5.0 | 4.80 |\n| PresentAgent | Claude-3.7-sonnet | 0.64 | 4.0 | 4.0 | 4.0 | 4.00 | 4.2 | 4.6 | 4.8 | 4.53 |\n| PresentAgent | Qwen-VL-Max | 0.52 | 4.2 | 4.8 | 4.4 | 4.47 | 4.6 | 4.2 | 5.0 | 4.60 |\n| PresentAgent | Gemini-2.5-pro | 0.52 | 4.2 | 4.4 | 4.4 | 4.33 | 4.2 | 4.0 | 4.8 | 4.33 |\n| PresentAgent | Gemini-2.5-flash | 0.52 | 4.2 | 5.0 | 3.8 | 4.33 | 4.2 | 4.2 | 4.8 | 4.40 |\n| PresentAgent | GPT-4o-Mini | 0.64 | 4.8 | 4.6 | 4.6 | 4.67 | 4.0 | 4.4 | 4.8 | 4.40 |\n| PresentAgent | GPT-4o | 0.56 | 4.0 | 4.2 | 3.6 | 3.93 | 4.2 | 4.4 | 4.8 | 4.47 |\n\nTable 1: Detailed evaluation results on the 5-document test set. Fact-based evaluation includes accuracy on five fixed quiz questions (Q1–Q5). Preference-based evaluation includes 1–5 scale scores for content fidelity, visual design, and overall clarity. Each Quality Score group has a calculated mean column.",
+ "medias": []
+ },
+ {
+ "title": "Analysis",
+ "content": "Figure 4 Presents a full example of a PresentAgentauto-generated presentation video, showing a technical blog turned into a narrated presentation. The system identifies structural segments (e.g., introduction, technical explanations) and generates slides with oral-style captions and synchronized speech, covering topics like \"parallelization workflow\" and \"agent system architecture\" to demonstrate its ability to keep technical accuracy while delivering content clearly and conversationally.\n\n\n\nFigure 4: PresentAgent Demo. Automatically generates academic-style slides and narrated videos from research papers, streamlining the transformation from written content to engaging visual presentations.",
+ "medias": []
+ }
+ ],
+ "markdown_content": null
+ },
+ {
+ "title": "Conclusion",
+ "summary": "The evaluation setup consists of 30 long-form documents with human-created reference videos spanning diverse topics. Each document is processed through the authors' generation pipeline to create a two-minute presentation video. The evaluation framework, PresentEval, employs a split strategy: Qwen-VL-2.5-3B conducts objective assessment via multiple-choice comprehension questions on entire videos, while Qwen-Omni-7B performs subjective scoring on shorter segments. Evaluation dimensions include narrative coherence, visual/audio appeal, and comprehension difficulty, guided by specific prompts. This approach addresses the current limitation of multimodal models in processing longer videos while maintaining comprehensive assessment across content quality, visual quality, and comprehension accuracy.",
+ "subsections": [
+ {
+ "title": "PresentAgent System Summary",
+ "content": "In conclusion, we presented PresentAgent, a modular system for transforming long-form documents into narrated presentation videos. By addressing the challenges of slide planning, narration synthesis, and synchronized rendering, PresentAgent enables structured, controllable, and reusable multimodal outputs.",
+ "medias": []
+ },
+ {
+ "title": "Evaluation Approach",
+ "content": "To evaluate this novel task, we introduced a diverse benchmark and proposed complementary factual and preference-based metrics.",
+ "medias": []
+ },
+ {
+ "title": "Results and Impact",
+ "content": "Experimental results show that PresentAgent generates coherent, engaging, and informative presentations, approaching human quality. This work lays the groundwork for automated, explainable content generation and opens new directions for research in multimodal communication across education, business, and accessibility.",
+ "medias": []
+ }
+ ],
+ "markdown_content": null
+ },
+ {
+ "title": "References",
+ "summary": "The evaluation setup consists of 30 long-form documents with human-created reference videos spanning diverse topics. Each document is processed through the authors' generation pipeline to create a two-minute presentation video. The evaluation framework, PresentEval, employs a split strategy: Qwen-VL-2.5-3B conducts objective assessment via multiple-choice comprehension questions on entire videos, while Qwen-Omni-7B performs subjective scoring on shorter segments. Evaluation dimensions include narrative coherence, visual/audio appeal, and comprehension difficulty, guided by specific prompts. This approach addresses the current limitation of multimodal models in processing longer videos while maintaining comprehensive assessment across content quality, visual quality, and comprehension accuracy.",
+ "subsections": [
+ {
+ "title": "Academic Citations",
+ "content": "- Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, and 1 others. 2023. Gpt-4 technical report. *arXiv preprint arXiv:2303.08774*.\n- Rie Kubota Ando and Tong Zhang. 2005. A framework for learning predictive structures from multiple tasks and unlabeled data. *Journal of Machine Learning Research*, 6:1817–1853.\n- Galen Andrew and Jianfeng Gao. 2007. Scalable training of L1-regularized log-linear models. In *Proceedings of the 24th International Conference on Machine Learning*, pages 33–40.\n- Anthropic. 2024. Claude 3 technical overview. https://www.anthropic.com/news/claude-3. Accessed: 2025-06-30.\n- Shuai Bai, Keqin Chen, Xuejing Liu, Jialin Wang, Wenbin Ge, Sibo Song, Kai Dang, Peng Wang, Shijie Wang, Jun Tang, and 1 others. 2025. Qwen2. 5-vl technical report. *arXiv preprint arXiv:2502.13923*.\n- Iz Beltagy, Matthew E Peters, and Arman Cohan. 2020. Longformer: The long-document transformer. *arXiv preprint arXiv:2004.05150*.\n- Jiaao Chen and Diyi Yang. 2021. Structure-aware abstractive conversation summarization via discourse and action graphs. *arXiv preprint arXiv:2104.08400*.\n- Google DeepMind. 2024. Gemini 2.5: Pushing the frontier with advanced reasoning, multimodality, long context, and next generation agentic capabilities. https://deepmind.google/technologies/ gemini/. Accessed: 2025-06-30.\n- Chaorui Deng, Deyao Zhu, Kunchang Li, Chenhui Gou, Feng Li, Zeyu Wang, Shu Zhong, Weihao Yu, Xiaonan Nie, Ziang Song, and 1 others. 2025. Emerging properties in unified multimodal pretraining. *arXiv preprint arXiv:2505.14683*.\n- Tsu-Jui Fu, William Yang Wang, Daniel McDuff, and Yale Song. 2022. Doc2ppt: Automatic presentation slides generation from scientific documents. In *Proceedings of the AAAI Conference on Artificial Intelligence*, volume 36, pages 634–642.\n- Jiaxin Ge, Zora Zhiruo Wang, Xuhui Zhou, Yi-Hao Peng, Sanjay Subramanian, Qinyue Tan, Maarten Sap, Alane Suhr, Daniel Fried, Graham Neubig, and Trevor Darrell. 2025. Autopresent: Designing structured visuals from scratch. *arXiv preprint arXiv:2501.00912*.\n- Yingqing He, Menghan Xia, Haoxin Chen, Xiaodong Cun, Yuan Gong, Jinbo Xing, Yong Zhang, Xintao Wang, Chao Weng, Ying Shan, and 1 others. 2023. Animate-a-story: Storytelling with retrieval-augmented video generation. *arXiv preprint arXiv:2307.06940*.\n- Levon Khachatryan, Andranik Movsisyan, Vahram Tadevosyan, Roberto Henschel, Zhangyang Wang, Shant Navasardyan, and Humphrey Shi. 2023. Text2video-zero: Text-to-image diffusion models are zero-shot video generators. In *Proceedings of the IEEE/CVF International Conference on Computer Vision*, pages 15954–15964.\n- Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov, and Luke Zettlemoyer. 2019. Bart: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. *arXiv preprint arXiv:1910.13461*.\n- Bo Li, Yuanhan Zhang, Dong Guo, Renrui Zhang, Feng Li, Hao Zhang, Kaichen Zhang, Peiyuan Zhang, Yanwei Li, Ziwei Liu, and 1 others. 2024. Llavaonevision: Easy visual task transfer. *arXiv preprint arXiv:2408.03326*.\n- Xin Li, Wenqing Chu, Ye Wu, Weihang Yuan, Fanglong Liu, Qi Zhang, Fu Li, Haocheng Feng, Errui Ding, and Jingdong Wang. 2023. Videogen: A reference-guided latent diffusion approach for high definition text-to-video generation. 
*arXiv preprint arXiv:2309.00398*.\n- Kevin Qinghong Lin, Linjie Li, Difei Gao, Qinchen Wu, Mingyi Yan, Zhengyuan Yang, Lijuan Wang, and Mike Zheng Shou. 2024a. Videogui: A benchmark for gui automation from instructional videos. *arXiv preprint arXiv:2406.10227*.\n- Kevin Qinghong Lin, Linjie Li, Difei Gao, Zhengyuan Yang, Shiwei Wu, Zechen Bai, Weixian Lei, Lijuan Wang, and Mike Zheng Shou. 2024b. Showui: One vision-language-action model for gui visual agent. *arXiv preprint arXiv:2411.17465*.\n- Pan Lu, Bowen Chen, Sheng Liu, Rahul Thapa, Joseph Boen, and James Zou. 2025. Octotools: An agentic framework with extensible tools for complex reasoning. *arXiv preprint arXiv:2502.11271*.\n- Shravan Nayak, Xiangru Jian, Kevin Qinghong Lin, Juan A. Rodriguez, Montek Kalsi, Rabiul Awal, Nicolas Chapados, M. Tamer Özsu, Aishwarya Agrawal, David Vazquez, Christopher Pal, Perouz Taslakian, Spandana Gella, and Sai Rajeswar. 2025. Ui-vision: A desktop-centric gui benchmark for visual perception and interaction. *arXiv preprint arXiv:2503.15661*.\n- Junrui Ni, Liming Wang, Heting Gao, Kaizhi Qian, Yang Zhang, Shiyu Chang, and Mark Hasegawa-Johnson. 2022. Unsupervised text-to-speech synthesis by unsupervised automatic speech recognition. *arXiv preprint arXiv:2203.15796*.\n- Wei Pang, Kevin Qinghong Lin, Xiangru Jian, Xi He, and Philip Torr. 2025. Paper2poster: Towards multimodal poster automation from scientific papers. *arXiv preprint arXiv:2505.21497*.\n- Vadim Popov, Ivan Vovk, Vladimir Gogoryan, Tasnima Sadekova, and Mikhail Kudinov. 2021. Grad-tts: A diffusion probabilistic model for text-to-speech. In *International conference on machine learning*, pages 8599–8608. PMLR.\n- Yujia Qin, Yining Ye, Junjie Fang, Haoming Wang, Shihao Liang, Shizuo Tian, Junda Zhang, Jiahao Li, Yunxin Li, Shijue Huang, and 1 others. 2025. Uitars: Pioneering automated gui interaction with native agents. *arXiv preprint arXiv:2501.12326*.\n- Mohammad Sadegh Rasooli and Joel R. Tetreault. 2015. Yara parser: A fast and accurate dependency parser. *Computing Research Repository*, arXiv:1503.06733. Version 2.\n- Yi Ren, Yangjun Ruan, Xu Tan, Tao Qin, Sheng Zhao, Zhou Zhao, and Tie-Yan Liu. 2019. Fastspeech: Fast, robust and controllable text to speech. *Advances in neural information processing systems*, 32.\n- Timo Schick, Jane Dwivedi-Yu, Roberto Dessì, and et al. 2023. Toolformer: Language models can teach themselves to use tools. *arXiv preprint arXiv:2302.04761*.\n- Shivam R Solanki and Drupad K Khublani. 2024. From script to screen: Unveiling text-to-video generation. In *Generative Artificial Intelligence: Exploring the Power and Potential of Generative AI*, pages 81–112. Springer.\n- Qiushi Sun, Kanzhi Cheng, Zichen Ding, Chuanyang Jin, Yian Wang, Fangzhi Xu, Zhenyu Wu, Chengyou Jia, Liheng Chen, Zhoumianze Liu, and 1 others. 2024. Os-genesis: Automating gui agent trajectory construction via reverse task synthesis. *arXiv preprint arXiv:2412.19723*.\n- Hideyuki Tachibana, Katsuya Uenoyama, and Shunsuke Aihara. 2018. Efficiently trainable text-to-speech system based on deep convolutional networks with guided attention. In *2018 IEEE international conference on acoustics, speech and signal processing (ICASSP)*, pages 4784–4788. IEEE.\n- Baode Wang, Biao Wu, Weizhen Li, Meng Fang, Yanjie Liang, Zuming Huang, Haozhe Wang, Jun Huang, Ling Chen, Wei Chu, and 1 others. 2025. Infinity parser: Layout aware reinforcement learning for scanned document parsing. 
*arXiv preprint arXiv:2506.03197*.\n- Guanghua Wang, Priyanshi Garg, and Weili Wu. 2024a. Segmented summarization and refinement: A pipeline for long-document analysis on social media. *Journal of Social Computing*, 5(2):132–144.\n- Peng Wang, Shuai Bai, Sinan Tan, Shijie Wang, Zhihao Fan, Jinze Bai, Keqin Chen, Xuejing Liu, Jialin Wang, Wenbin Ge, and 1 others. 2024b. Qwen2 vl: Enhancing vision-language model's perception of the world at any resolution. *arXiv preprint arXiv:2409.12191*.\n- Xingyao Wang, Boxuan Li, Yufan Song, Frank F Xu, Xiangru Tang, Mingchen Zhuge, Jiayi Pan, Yueqi Song, Bowen Li, Jaskirat Singh, and 1 others. 2024c. Opendevin: An open platform for ai software developers as generalist agents. *arXiv preprint arXiv:2407.16741*.\n- Yuan Wang, Di Huang, Yaqi Zhang, Wanli Ouyang, Jile Jiao, Xuetao Feng, Yan Zhou, Pengfei Wan, Shixiang Tang, and Dan Xu. 2024d. Motiongpt-2: A general-purpose motion-language model for motion generation and understanding. *arXiv preprint arXiv:2410.21747*.\n- Biao Wu, Yanda Li, Meng Fang, Zirui Song, Zhiwei Zhang, Yunchao Wei, and Ling Chen. 2024. Foundations and recent trends in multimodal mobile agents: A survey. *arXiv preprint arXiv:2411.02006*.\n- Jinheng Xie, Weijia Mao, Zechen Bai, David Junhao Zhang, Weihao Wang, Kevin Qinghong Lin, Yuchao Gu, Zhijie Chen, Zhenheng Yang, and Mike Zheng Shou. 2024. Show-o: One single transformer to unify multimodal understanding and generation. *arXiv preprint arXiv:2408.12528*.\n- Jin Xu, Zhifang Guo, Jinzheng He, Hangrui Hu, Ting He, Shuai Bai, Keqin Chen, Jialin Wang, Yang Fan, Kai Dang, and 1 others. 2025. Qwen2. 5-omni technical report. *arXiv preprint arXiv:2503.20215*.\n- Qiyao Xue, Xiangyu Yin, Boyuan Yang, and Wei Gao. 2025. Phyt2v: Llm-guided iterative self-refinement for physics-grounded text-to-video generation. In *Proceedings of the Computer Vision and Pattern Recognition Conference*, pages 18826–18836.\n- John Yang, Carlos Jimenez, Alexander Wettig, Kilian Lieret, Shunyu Yao, Karthik Narasimhan, and Ofir Press. 2024a. Swe-agent: Agent-computer interfaces enable automated software engineering. *Advances in Neural Information Processing Systems*, 37:50528– 50652.\n- Ke Yang, Jiateng Liu, John Wu, Chaoqi Yang, Yi R Fung, Sha Li, Zixuan Huang, Xu Cao, Xingyao Wang, Yiquan Wang, and 1 others. 2024b. If llm is the wizard, then code is the wand: A survey on how code empowers large language models to serve as intelligent agents. *arXiv preprint arXiv:2401.00812*.\n- Rui Yang, Lin Song, Yanwei Li, Sijie Zhao, Yixiao Ge, Xiu Li, and Ying Shan. 2023a. Gpt4tools: Teaching large language model to use tools via self-instruction. *Advances in Neural Information Processing Systems*, 36:71995–72007.\n- Zhengyuan Yang, Linjie Li, Jianfeng Wang, Kevin Lin, Ehsan Azarnasab, Faisal Ahmed, Zicheng Liu, Ce Liu, Michael Zeng, and Lijuan Wang. 2023b. Mm-react: Prompting chatgpt for multimodal reasoning and action. *arXiv preprint arXiv:2303.11381*.\n- Zhuoyi Yang, Jiayan Teng, Wendi Zheng, Ming Ding, Shiyu Huang, Jiazheng Xu, Yuanming Yang, Wenyi Hong, Xiaohan Zhang, Guanyu Feng, and 1 others. 2024c. Cogvideox: Text-to-video diffusion models with an expert transformer. *arXiv preprint arXiv:2408.06072*.\n- Shunyu Yao, Jeffrey Zhao, Dian Yu, Nan Du, Izhak Shafran, Karthik R Narasimhan, and Yuan Cao. 2023. React: Synergizing reasoning and acting in language models. In *The Eleventh International Conference on Learning Representations*.\n- Murong Yue, Wenlin Yao, Haitao Mi, Dian Yu, Ziyu Yao, and Dong Yu. 2024. 
Dots: Learning to reason dynamically in llms via optimal reasoning trajectories search. *arXiv preprint arXiv:2410.03864*.\n- Zeyu Zhang, Yiran Wang, Biao Wu, Shuo Chen, Zhiyuan Zhang, Shiya Huang, Wenbo Zhang, Meng Fang, Ling Chen, and Yang Zhao. 2024. Motion avatar: Generate human and animal avatars with arbitrary motion. *arXiv preprint arXiv:2405.11286*.\n- Hao Zheng, Xinyan Guan, Hao Kong, Jia Zheng, Weixiang Zhou, Hongyu Lin, Yaojie Lu, Ben He, Xianpei Han, and Le Sun. 2025a. Pptagent: Generating and evaluating presentations beyond text-to-slides. *arXiv preprint arXiv:2501.03936*.\n- Hao Zheng, Xinyan Guan, Hao Kong, Jia Zheng, Weixiang Zhou, Hongyu Lin, Yaojie Lu, Ben He, Xianpei Han, and Le Sun. 2025b. Pptagent: Generating and evaluating presentations beyond text-to-slides. *arXiv preprint arXiv:2501.03936*.\n- Zixiang Zhou, Yu Wan, and Baoyuan Wang. 2024. Avatargpt: All-in-one framework for motion understanding planning generation and beyond. In *Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition*, pages 1357–1366.",
+ "medias": []
+ }
+ ],
+ "markdown_content": null
+ },
+ {
+ "title": "Related Work",
+ "summary": "The evaluation setup consists of 30 long-form documents with human-created reference videos spanning diverse topics. Each document is processed through the authors' generation pipeline to create a two-minute presentation video. The evaluation framework, PresentEval, employs a split strategy: Qwen-VL-2.5-3B conducts objective assessment via multiple-choice comprehension questions on entire videos, while Qwen-Omni-7B performs subjective scoring on shorter segments. Evaluation dimensions include narrative coherence, visual/audio appeal, and comprehension difficulty, guided by specific prompts. This approach addresses the current limitation of multimodal models in processing longer videos while maintaining comprehensive assessment across content quality, visual quality, and comprehension accuracy.",
+ "subsections": [
+ {
+ "title": "Document-to-Multimodal Generation",
+ "content": "Recent advances in large language models (LLMs) and multimodal generation have sparked growing interest in converting documents into diverse output formats, such as slides, posters, or audio summaries (Xu et al., 2025; Wang et al., 2025; Pang et al., 2025; Sun et al., 2024). Systems like PP-TAgent (Zheng et al., 2025b) and Doc2PPT (Fu et al., 2022) treat document-to-slide generation as a structured summarization problem, focusing on layout-aware slide construction. Other works, such as Paper2Poster (Pang et al., 2025) extend this idea by producing single-page visual summaries using layout planning and visual feedback. However, these systems typically generate static outputs and do not model time-dependent delivery such as narration or slide progression. Our work builds upon these foundations, but further introduces temporal planning and audio-visual synchronization, enabling the generation of fully narrated presentation videos.",
+ "medias": []
+ },
+ {
+ "title": "Vision-Language Agents",
+ "content": "Recent advances have highlighted the expanding capabilities of vision language models (VLMs) beyond traditional language understanding. Techniques such as ReAct (Yao et al., 2023; Yang et al., 2023b; Yue et al., 2024) have shown that LLMs can operate as autonomous agents, capable of stepby-step reasoning and dynamic interaction through code execution (Wang et al., 2024c; Yang et al., 2024a,b), API function calls (Schick et al., 2023; Lu et al., 2025; Yang et al., 2023a), user interface manipulation (Lin et al., 2024b; Qin et al., 2025; Nayak et al., 2025; Wu et al., 2024), and motion generation (Zhang et al., 2024; Zhou et al., 2024; Wang et al., 2024d). Despite these developments, general-purpose agents still struggle with professional tasks that demand accuracy, domainspecific knowledge, and reliable interaction (Lin et al., 2024a). A closely related area is slide automation (Ge et al., 2025; Zheng et al., 2025a), which agents translate short text prompts into executable Python code to render presentation slides. In contrast, our proposed presentation video generation task is significantly more challenging: instead of taking a short prompt as input, the system processes an entire long-form document—such as a research paper, product manual, or technical report—and produces a well-structured presentation video with oral-style narration. This task imposes higher demands on content understanding, multimodal alignment, speech generation, and video synthesis. To address these challenges, we design a generation pipeline along with an automatic evaluation framework to systematically assess the generated videos in terms of information delivery, visual quality, and overall comprehensibility.",
+ "medias": []
+ },
+ {
+ "title": "Implementation Details",
+ "content": "PresentAgent is implemented using a modular architecture that integrates LLMs, VLMs, and text-to-speech (TTS) systems. Our primary models include LLMs (GPT-4o, GPT-4o-mini, Claude-3.7-sonnet) and VLMs (Qwen-VL-Max, Gemini-2.5-Flash, Gemini-2.5- Pro). For TTS systems, we choose the MegaTTS3 model for better performance. For visual and multimodal evaluation, we use Qwen-VL-2.5-3B-Instruct as VLM. In our experimental pipeline, any input document is automatically transformed into a Power-Point deck, paired with a generated audio narration, and then composited into a synchronized video presentation.",
+ "medias": []
+ }
+ ],
+ "markdown_content": null
+ },
+ {
+ "title": "Implementation Details",
+ "summary": "The evaluation setup consists of 30 long-form documents with human-created reference videos spanning diverse topics. Each document is processed through the authors' generation pipeline to create a two-minute presentation video. The evaluation framework, PresentEval, employs a split strategy: Qwen-VL-2.5-3B conducts objective assessment via multiple-choice comprehension questions on entire videos, while Qwen-Omni-7B performs subjective scoring on shorter segments. Evaluation dimensions include narrative coherence, visual/audio appeal, and comprehension difficulty, guided by specific prompts. This approach addresses the current limitation of multimodal models in processing longer videos while maintaining comprehensive assessment across content quality, visual quality, and comprehension accuracy.",
+ "subsections": [
+ {
+ "title": "Multimodal Architecture",
+ "content": "PresentAgent adopts a highly modular multimodalgeneration architecture. At the languageunderstanding and generation layer, we run six primary LLM back ends in parallel—GPT-4o, GPT-4o-mini, Qwen-VL-Max, Gemini-2.5-Flash, Gemini-2.5-Pro, and Claude-3.7-Sonnet—and select or ensemble them on-the-fly with a dynamic routing policy that weighs input length, conversational complexity, and latency budget. For visuallanguage evaluation, we introduce the lightweight VLM Qwen-VL-2.5-3B-Instruct to score slide layout, chart readability, and cross-modal consistency, feeding its self-critique back into generation. Speech synthesis is unified on MegaTTS3, which outputs 24 kHz, 16-bit high-fidelity narration and supports prosody-tag controls for fine-grained rate, pitch, and emotion adjustment.",
+ "medias": []
+ },
+ {
+ "title": "Experimental Pipeline",
+ "content": "The experimental pipeline converts any input document—PDF, Markdown, DOCX, or web snapshot through three automated stages:\n\n1. Structured parsing & re-ordering that maps content to a hierarchical topic–subtopic tree.\n\n2. Per-slide generation with the chosen LLM, producing a PowerPoint deck containing titles, bullet points, graphic placeholders, and Alt-Text, while retrieving and inserting relevant images for key nouns.\n\n3. Synchronized narration generation with MegaTTS3 in Chinese or English, followed by an FFmpeg script that assembles a 1080 p video with fade-in/out transitions and optional captions.",
+ "medias": []
+ }
+ ],
+ "markdown_content": null
+ },
+ {
+ "title": "C Discussion",
+ "summary": "The evaluation setup consists of 30 long-form documents with human-created reference videos spanning diverse topics. Each document is processed through the authors' generation pipeline to create a two-minute presentation video. The evaluation framework, PresentEval, employs a split strategy: Qwen-VL-2.5-3B conducts objective assessment via multiple-choice comprehension questions on entire videos, while Qwen-Omni-7B performs subjective scoring on shorter segments. Evaluation dimensions include narrative coherence, visual/audio appeal, and comprehension difficulty, guided by specific prompts. This approach addresses the current limitation of multimodal models in processing longer videos while maintaining comprehensive assessment across content quality, visual quality, and comprehension accuracy.",
+ "subsections": [
+ {
+ "title": "Current Work Synthesis",
+ "content": "In this work, we synthesized presentation-style videos that integrate visual slides, textual narration, and spoken audio, simulating realistic multimodal communication scenarios. While our current evaluation focuses on the individual quality of each modality—such as visual clarity, textual relevance, and audio intelligibility—these dimensions are treated independently. However, in real-world applications, the effectiveness of communication often hinges on the semantic and temporal coherence across modalities.",
+ "medias": []
+ },
+ {
+ "title": "Future Research Direction",
+ "content": "Future research should thus move beyond isolated assessments and aim toward fusion-aware understanding and evaluation. This entails not only modeling the interactions and alignment among image, audio, and text modalities, but also enabling the system to reason over their combined meaning. Existing models like ImageBind offer a unified embedding space for multiple modalities, but lack the capacity for high-level inference and semantic comprehension.",
+ "medias": []
+ },
+ {
+ "title": "Multimodal Reasoning Integration",
+ "content": "A promising direction lies in bridging representation alignment with multimodal reasoning, by integrating aligned modality encoders with powerful language models. This would allow the system to jointly perceive, interpret, and respond to complex multimodal inputs—such as explaining a visual concept based on both audio narration and visual cues, or identifying inconsistencies across modalities. Developing such reasoning-capable, fusion-aware models will be critical for advancing robust, coherent multimodal understanding in real-world applications.",
+ "medias": []
+ }
+ ],
+ "markdown_content": null
+ },
+ {
+ "title": "Limitations",
+ "summary": "The evaluation setup consists of 30 long-form documents with human-created reference videos spanning diverse topics. Each document is processed through the authors' generation pipeline to create a two-minute presentation video. The evaluation framework, PresentEval, employs a split strategy: Qwen-VL-2.5-3B conducts objective assessment via multiple-choice comprehension questions on entire videos, while Qwen-Omni-7B performs subjective scoring on shorter segments. Evaluation dimensions include narrative coherence, visual/audio appeal, and comprehension difficulty, guided by specific prompts. This approach addresses the current limitation of multimodal models in processing longer videos while maintaining comprehensive assessment across content quality, visual quality, and comprehension accuracy.",
+ "subsections": [
+ {
+ "title": "Computational Cost Constraints",
+ "content": "Our work faces two key constraints: (1) Due to the high computational costs of commercial LLM/VLM APIs (e.g., GPT-4o and Gemini-2.5- Pro), evaluation was limited to five academic papers, potentially underrepresenting the document diversity shown in our benchmark (Figure 5);",
+ "medias": []
+ },
+ {
+ "title": "Static Slide Limitations",
+ "content": "(2) PresentAgent currently generates static slides without dynamic animations/effects due to architectural constraints in video synthesis and trade-offs between generation speed and visual quality, as noted in ChronoMagic-Bench's temporal coherence studies. Future improvements could involve lightweight distillation models and physics-aware rendering engines.",
+ "medias": []
+ }
+ ],
+ "markdown_content": null
+ },
+ {
+ "title": "Evaluation Benchmark",
+ "summary": "The evaluation setup consists of 30 long-form documents with human-created reference videos spanning diverse topics. Each document is processed through the authors' generation pipeline to create a two-minute presentation video. The evaluation framework, PresentEval, employs a split strategy: Qwen-VL-2.5-3B conducts objective assessment via multiple-choice comprehension questions on entire videos, while Qwen-Omni-7B performs subjective scoring on shorter segments. Evaluation dimensions include narrative coherence, visual/audio appeal, and comprehension difficulty, guided by specific prompts. This approach addresses the current limitation of multimodal models in processing longer videos while maintaining comprehensive assessment across content quality, visual quality, and comprehension accuracy.",
+ "subsections": [
+ {
+ "title": "Document Types and Content",
+ "content": "As Shown in Figure 5, we showcase four of the representative document types in our benchmark: academic papers, web pages, technical blogs, and presentation slides. These documents cover a broad spectrum of real-world content domains, such as educational tutorials, research briefs, product manuals, scientific articles, news commentary, and business reports. Each document is paired with a manually authored presentation video, providing a diverse and realistic testbed for evaluating documentto-video generation systems in terms of multimodal coherence, content preservation, and presentation quality.",
+ "medias": []
+ }
+ ],
+ "markdown_content": null
+ },
+ {
+ "title": "Doc2Present Dataset Details",
+ "summary": "The evaluation setup consists of 30 long-form documents with human-created reference videos spanning diverse topics. Each document is processed through the authors' generation pipeline to create a two-minute presentation video. The evaluation framework, PresentEval, employs a split strategy: Qwen-VL-2.5-3B conducts objective assessment via multiple-choice comprehension questions on entire videos, while Qwen-Omni-7B performs subjective scoring on shorter segments. Evaluation dimensions include narrative coherence, visual/audio appeal, and comprehension difficulty, guided by specific prompts. This approach addresses the current limitation of multimodal models in processing longer videos while maintaining comprehensive assessment across content quality, visual quality, and comprehension accuracy.",
+ "subsections": [
+ {
+ "title": "Data Source",
+ "content": "We collect 30 high-quality video samples from public platforms, educational repositories, and professional presentation archives. Each video follows a structured narration format, combining slide-based visuals with synchronized voiceover. We manually align each video with its source document and ensure the following conditions are met: (1) the content structure of the video follows that of the document; (2) the visuals convey document information in a compact, structured form; and (3) the narration and slides are well-aligned temporally.",
+ "medias": []
+ },
+ {
+ "title": "Data Statistics",
+ "content": "The average document length is 3,000–8,000 words, while the corresponding videos range from 1 to 2 minutes and contain 5-10 slides. This setting highlights the core challenge of the task: transforming dense, domain-specific documents into effective and digestible multimodal presentations.",
+ "medias": []
+ }
+ ],
+ "markdown_content": null
+ },
+ {
+ "title": "PresentEval Evaluation Methods",
+ "summary": "The evaluation setup consists of 30 long-form documents with human-created reference videos spanning diverse topics. Each document is processed through the authors' generation pipeline to create a two-minute presentation video. The evaluation framework, PresentEval, employs a split strategy: Qwen-VL-2.5-3B conducts objective assessment via multiple-choice comprehension questions on entire videos, while Qwen-Omni-7B performs subjective scoring on shorter segments. Evaluation dimensions include narrative coherence, visual/audio appeal, and comprehension difficulty, guided by specific prompts. This approach addresses the current limitation of multimodal models in processing longer videos while maintaining comprehensive assessment across content quality, visual quality, and comprehension accuracy.",
+ "subsections": [
+ {
+ "title": "Objective Quiz Evaluation",
+ "content": "## G.1 Prompts of Objective Quiz Evaluation\n\nTable 2 presents the prompting content for the evaluation method utilizing objective quiz-based assessment. Each set of questions included in this evaluation is crafted manually, with its creation firmly rooted in the actual content of the relevant documents. The formulation of these questions\n\n\n\nFigure 5: Document Diversity in Our Evaluation Benchmark.\n\n| Prensentation of Web Pages | What is the main feature highlighted in the iPhone's promotional webpage? |\n| --- | --- |\n| A. | A more powerful chip for faster performance |\n| B. | A brighter and more vibrant display |\n| C. | An upgraded camera system with better lenses |\n| D. | A longer-lasting and more efficient battery |\n| Prensentation of Academic Paper | What primary research gap did the authors aim to address by introducing the FineGym dataset? |\n| A. | Lack of low-resolution sports footage for compression studies |\n| B. | Need for fine-grained action understanding that goes beyond coarse categories |\n| C. | Absence of synthetic data to replace human annotations |\n| D. | Shortage of benchmarks for background context recognition |\n\nTable 2: Prompt of evaluation via Objective Quiz Evaluation. Each question set is manually created based on the actual document content, with a focus on topic recognition, structural understanding, and key argument identification. These questions evaluate how well the generated video communicates the source material.\n\nplaces a distinct emphasis on three key aspects: topic recognition, which involves the ability to accurately identify and grasp the central themes of the source material; structural understanding, referring to the comprehension of the organizational framework and logical arrangement of the document; and key argument identification, focusing on the capacity to pinpoint the core viewpoints and supporting arguments within the content. These carefully designed questions serve as a means to evaluate the extent to which the generated video successfully conveys the essential information, core ideas, and structural logic of the original source material, thereby assessing the effectiveness of the video in communicating the source content.",
+ "medias": [
+ {
+ "markdown_content": "",
+ "near_chunks": [
+ "Table 2 presents the prompting content for the evaluation method utilizing objective quiz-based assessment. Each set of questions included in this evaluation is crafted manually, with its creation firmly rooted in the actual content of the relevant documents. The formulation of these questions\n\n",
+ "Figure 5: Document Diversity in Our Evaluation Benchmark.\n\nTable 2: Prompt of evaluation via Objective Quiz Evaluation. Each question set is manually created based on the actual document content, with a focus on topic recognition, structural understanding, and key argument identification. These questions evaluate how well the generated video communicates the source material.\n\n"
+ ],
+ "path": "/Users/shijingwei/Desktop/PresentAgent/presentagent/../pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/_page_11_Figure_0.jpeg",
+ "caption": "Figure: Document diversity visualization showing four types of input documents: Academic Papers, Web Pages, Technical Blogs, and Slides, with representative examples of each format displayed in a grid layout."
+ },
+ {
+ "markdown_content": "| Prensentation of Web Pages | What is the main feature highlighted in the iPhone's promotional webpage? |\n| --- | --- |\n| A. | A more powerful chip for faster performance |\n| B. | A brighter and more vibrant display |\n| C. | An upgraded camera system with better lenses |\n| D. | A longer-lasting and more efficient battery |\n| Prensentation of Academic Paper | What primary research gap did the authors aim to address by introducing the FineGym dataset? |\n| A. | Lack of low-resolution sports footage for compression studies |\n| B. | Need for fine-grained action understanding that goes beyond coarse categories |\n| C. | Absence of synthetic data to replace human annotations |\n| D. | Shortage of benchmarks for background context recognition |",
+ "near_chunks": [
+ "Figure 5: Document Diversity in Our Evaluation Benchmark.\n\nTable 2 presents the prompting content for the evaluation method utilizing objective quiz-based assessment. Each set of questions included in this evaluation is crafted manually, with its creation firmly rooted in the actual content of the relevant documents. The formulation of these questions\n\n",
+ "Table 2: Prompt of evaluation via Objective Quiz Evaluation. Each question set is manually created based on the actual document content, with a focus on topic recognition, structural understanding, and key argument identification. These questions evaluate how well the generated video communicates the source material.\n\n"
+ ],
+ "path": "/Users/shijingwei/Desktop/PresentAgent/presentagent/../pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/table_21d2.png",
+ "caption": "Table: Sample quiz questions evaluating understanding of web page and academic paper content, with multiple-choice options addressing iPhone features and FineGym dataset research contributions.",
+ "cells": [
+ [
+ "Prensentation of Web Pages",
+ "What is the main feature highlighted in the iPhone's promotional webpage?"
+ ],
+ [
+ "A.",
+ "A more powerful chip for faster performance"
+ ],
+ [
+ "B.",
+ "A brighter and more vibrant display"
+ ],
+ [
+ "C.",
+ "An upgraded camera system with better lenses"
+ ],
+ [
+ "D.",
+ "A longer-lasting and more efficient battery"
+ ],
+ [
+ "Prensentation of Academic Paper",
+ "What primary research gap did the authors aim to address by introducing the FineGym dataset?"
+ ],
+ [
+ "A.",
+ "Lack of low-resolution sports footage for compression studies"
+ ],
+ [
+ "B.",
+ "Need for fine-grained action understanding that goes beyond coarse categories"
+ ],
+ [
+ "C.",
+ "Absence of synthetic data to replace human annotations"
+ ],
+ [
+ "D.",
+ "Shortage of benchmarks for background context recognition"
+ ]
+ ],
+ "merge_area": null
+ }
+ ]
+ },
+ {
+ "title": "Subjective Scoring",
+ "content": "### G.2 Prompts of Subjective Scoring\n\nPrompt of evaluation via subjective scoring is shown in table 3. This table showcases the prompting content employed in the subjective scoringbased evaluation approach. Each individual prompt within this set is precisely targeted at a specific evaluative dimension. These dimensions encompass\n\nnarrative coherence, which pertains to the logical flow and consistency of the storytelling; visual appeal and audio appeal, focusing on the attractiveness and engaging nature of the visual elements and audio components respectively; and comprehension difficulty, referring to the level of ease or challenge in understanding the presented content. These prompts are meticulously designed to serve as a guiding framework for vision-language models, enabling them to assess presentations from a human-centric perspective. This means that the evaluation aligns with human perceptions, preferences, and ways of understanding, ensuring that the assessment results are more in line with how humans would judge the quality of the presentations.",
+ "medias": []
+ }
+ ],
+ "markdown_content": null
+ },
+ {
+ "title": "Evaluation Setup",
+ "summary": "The evaluation setup consists of 30 long-form documents with human-created reference videos spanning diverse topics. Each document is processed through the authors' generation pipeline to create a two-minute presentation video. The evaluation framework, PresentEval, employs a split strategy: Qwen-VL-2.5-3B conducts objective assessment via multiple-choice comprehension questions on entire videos, while Qwen-Omni-7B performs subjective scoring on shorter segments. Evaluation dimensions include narrative coherence, visual/audio appeal, and comprehension difficulty, guided by specific prompts. This approach addresses the current limitation of multimodal models in processing longer videos while maintaining comprehensive assessment across content quality, visual quality, and comprehension accuracy.",
+ "subsections": [
+ {
+ "title": "Test Set Construction",
+ "content": "We construct a test set consisting of 30 long-form documents, each paired with a manually created presentation video that serves as a human-level reference. These documents span a diverse range of topics, including education, product explanation, research overviews, and policy briefings. For each document, we generate a corresponding presentation video using our full generation pipeline.",
+ "medias": [
+ {
+ "markdown_content": "| Video | Scoring Prompt |\n| --- | --- |\n| Narr. Coh. | \"How coherent is the narration across the video? Are the ideas logically connected and easy to follow?\" |\n| Visual Appeal | \"How would you rate the visual design of the slides in terms of layout, aesthetics, and overall quality?\" |\n| Comp. Diff. | \"How easy is it to understand the presentation as a viewer? Were there any confusing or contradictory parts?\" |\n| Audio | Scoring Prompt |\n| Narr. Coh. | \"How coherent is the narration throughout the audio? Are the ideas logically structured and easy to follow?\" |\n| Audio Appeal | \"How pleasant and engaging is the narrator's voice in terms of tone, pacing, and delivery?\" |\n| Comp. Diff. | \"How easy is it to understand the spoken content? Were there any unclear or confusing parts in the audio?\" |",
+ "near_chunks": [
+ "We construct a test set consisting of 30 long-form documents, each paired with a manually created presentation video that serves as a human-level reference. These documents span a diverse range of topics, including education, product explanation,\n\n# H Evaluation Setup\n\n",
+ "Table 3: Prompt of evaluation via Subjective Scoring. Each prompt targets a specific dimension—narrative coherence, visual/audio appeal, or comprehension difficulty—and is designed to guide vision-language models in assessing presentations from a human-centric perspective. Abbreviations: Narr. Coh. = Narrative Coherence; Comp. Diff. = Comprehension Difficulty.\n\n"
+ ],
+ "path": "/Users/shijingwei/Desktop/PresentAgent/presentagent/../pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/table_efca.png",
+ "caption": "Table: Subjective Scoring Prompts for Video and Audio Evaluation. The table outlines specific questions used to assess narrative coherence, visual/audio appeal, and comprehension difficulty when evaluating presentation quality from both visual and audio perspectives.",
+ "cells": [
+ [
+ "Video",
+ "Scoring Prompt"
+ ],
+ [
+ "Narr. Coh.",
+ "\"How coherent is the narration across the video? Are the ideas logically connected and easy to follow?\""
+ ],
+ [
+ "Visual Appeal",
+ "\"How would you rate the visual design of the slides in terms of layout, aesthetics, and overall quality?\""
+ ],
+ [
+ "Comp. Diff.",
+ "\"How easy is it to understand the presentation as a viewer? Were there any confusing or contradictory parts?\""
+ ],
+ [
+ "Audio",
+ "Scoring Prompt"
+ ],
+ [
+ "Narr. Coh.",
+ "\"How coherent is the narration throughout the audio? Are the ideas logically structured and easy to follow?\""
+ ],
+ [
+ "Audio Appeal",
+ "\"How pleasant and engaging is the narrator's voice in terms of tone, pacing, and delivery?\""
+ ],
+ [
+ "Comp. Diff.",
+ "\"How easy is it to understand the spoken content? Were there any unclear or confusing parts in the audio?\""
+ ]
+ ],
+ "merge_area": null
+ }
+ ]
+ },
+ {
+ "title": "Evaluation Framework",
+ "content": "All videos, both human-created and machinegenerated, are evaluated using our unified evaluation framework, PresentEval. Each synthesized video is approximately two minutes in length. However, due to the current lack of a single multimodal model capable of jointly assessing visual and audio quality for videos longer than two minutes, we adopt a split evaluation strategy.",
+ "medias": []
+ },
+ {
+ "title": "Evaluation Stages",
+ "content": "In the Objective Quiz stage, we use Qwen-VL-2.5-3B (Wang et al., 2024b) to evaluate the accuracy of the entire video using a fixed set of multiplechoice comprehension questions. In the Subjective Scoring stage, we extract short video/audio segments and evaluate them individually to assess quality in a more focused and scalable manner, using Qwen-Omni-7B (Xu et al., 2025).\n\nBoth models are guided by dimension-specific prompts and score each video or audio sample along three axes: Content Quality, Visual Quality, and Comprehension Accuracy.",
+ "medias": []
+ },
+ {
+ "title": "Scoring Prompts",
+ "content": "| Video | Scoring Prompt |\n| --- | --- |\n| Narr. Coh. | \"How coherent is the narration across the video? Are the ideas logically connected and easy to follow?\" |\n| Visual Appeal | \"How would you rate the visual design of the slides in terms of layout, aesthetics, and overall quality?\" |\n| Comp. Diff. | \"How easy is it to understand the presentation as a viewer? Were there any confusing or contradictory parts?\" |\n| Audio | Scoring Prompt |\n| Narr. Coh. | \"How coherent is the narration throughout the audio? Are the ideas logically structured and easy to follow?\" |\n| Audio Appeal | \"How pleasant and engaging is the narrator's voice in terms of tone, pacing, and delivery?\" |\n| Comp. Diff. | \"How easy is it to understand the spoken content? Were there any unclear or confusing parts in the audio?\" |\n\nTable 3: Prompt of evaluation via Subjective Scoring. Each prompt targets a specific dimension—narrative coherence, visual/audio appeal, or comprehension difficulty—and is designed to guide vision-language models in assessing presentations from a human-centric perspective. Abbreviations: Narr. Coh. = Narrative Coherence; Comp. Diff. = Comprehension Difficulty.",
+ "medias": []
+ }
+ ],
+ "markdown_content": null
+ }
+ ],
+ "metadata": {
+ "title": "PresentAgent: Multimodal Agent for Presentation Video Generation",
+ "authors": "Jingwei Shi, Zeyu Zhang, Biao Wu, Yanjie Liang, Meng Fang, Ling Chen, Yang Zhao",
+ "affiliations": "AI Geeks, Australia; Australian Artificial Intelligence Institute, Australia; University of Liverpool, United Kingdom; La Trobe University, Australia",
+ "corresponding_author": "y.zhao2@latrobe.edu.au",
+ "presentation-date": "2025-07-05"
+ }
+}
\ No newline at end of file
diff --git a/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/source.md b/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/source.md
new file mode 100644
index 0000000000000000000000000000000000000000..9ab2bcc12c1399062ec608b51d4232ef07664349
--- /dev/null
+++ b/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/source.md
@@ -0,0 +1,332 @@
+# PresentAgent: Multimodal Agent for Presentation Video Generation
+
+Jingwei Shi1∗ Zeyu Zhang1∗† Biao Wu2∗ Yanjie Liang1∗
+
+Meng Fang3 Ling Chen2 Yang Zhao4‡
+
+1AI Geeks, Australia
+
+2Australian Artificial Intelligence Institute, Australia 3University of Liverpool, United Kingdom 4La Trobe University, Australia
+
+∗Equal contribution. † Project lead. ‡Corresponding author: y.zhao2@latrobe.edu.au.
+
+#### Abstract
+
+We present PresentAgent, a multimodal agent that transforms long-form documents into narrated presentation videos. While existing approaches are limited to generating static slides or text summaries, our method advances beyond these limitations by producing fully synchronized visual and spoken content that closely mimics human-style presentations. To achieve this integration, PresentAgent employs a modular pipeline that systematically segments the input document, plans and renders slide-style visual frames, generates contextual spoken narration with large language models and Text-to-Speech models, and seamlessly composes the final video with precise audiovisual alignment. Given the complexity of evaluating such multimodal outputs, we introduce PresentEval, a unified assessment framework powered by Vision-Language Models that comprehensively scores videos across three critical dimensions: content fidelity, visual clarity, and audience comprehension through prompt-based evaluation. Our experimental validation on a curated dataset of 30 document–presentation pairs demonstrates that PresentAgent approaches human-level quality across all evaluation metrics. These results highlight the significant potential of controllable multimodal agents in transforming static textual materials into dynamic, effective, and accessible presentation formats. Code will be available at https://github.com/AIGeeksGroup/PresentAgent.
+
+### 1 Introduction
+
+Presentations are a widely used and effective medium for conveying complex ideas. By combining visual elements, structured narration, and spoken explanations, they enable information to unfold progressively and be more easily understood by diverse audiences (Fu et al., 2022). Despite their proven effectiveness, creating high-quality presentation videos from long-form documents—such as business reports, technical manuals, policy briefs, or academic papers—typically requires considerable manual effort (Li et al., 2023). This process involves identifying key content, designing slide layouts, writing scripts, recording narration, and aligning all elements into a coherent multimodal output.
+
+Figure 1: Overview of PresentAgent. It takes documents (e.g., web pages) as input and follows a generation pipeline: (1) document processing, (2) structured slide generation, (3) synchronized caption creation, and (4) audio synthesis. The final output is a presentation video combining visual slides with aligned narration. The purple-highlighted middle results emphasize the system's key transitional outputs during generation.
+
+Although recent advancements in AI have enabled progress in related areas such as document-to-slide generation (Fu et al., 2022; Zheng et al., 2025a; Pang et al., 2025; Zhang et al., 2024) and text-to-video synthesis (Yang et al., 2024c; Li et al., 2023; Xue et al., 2025; Khachatryan et al., 2023; He et al., 2023; Solanki and Khublani, 2024), a critical gap remains: these methods either produce static visual summaries or generic video clips without structured narration, limiting their effectiveness for structured communication tasks like presentations.
+
+To bridge this gap, we introduce the task of Document-to-Presentation Video Generation, which aims to automatically convert a structured or unstructured document into a narrated video presentation composed of synchronized slides and speech. This task presents unique challenges as it goes beyond traditional summarization (Lewis et al., 2019; Beltagy et al., 2020; Chen and Yang, 2021; Wang et al., 2024a) or text-to-speech (Tachibana et al., 2018; Ren et al., 2019; Popov et al., 2021; Ni et al., 2022) pipelines by requiring selective content abstraction, layout-aware planning (Wang et al., 2025), and precise multimodal alignment (Li et al., 2024) between visuals and narration. In contrast to prior work that focuses on either static slide and image generation (Zheng et al., 2025a; Deng et al., 2025; Xie et al., 2024) or audio summarization in isolation, our objective is to produce a fully integrated, viewer-ready video experience that closely mimics how human presenters deliver information in real-world scenarios.
+
+To tackle these challenges, we propose a modular generation framework named PresentAgent. Given an input document, the system first segments it into semantic blocks through outline planning, then generates layout-guided slide visuals for each block and rewrites the key message into oral-style narration. These narrations are then synthesized into audio and combined with the slide visuals to produce a time-aligned presentation video. Importantly, our pipeline is designed to be domain-adaptable and controllable, enabling broad applicability across document types and presentation styles.
+
+Recognizing the need for rigorous evaluation of such complex multimodal outputs, we curate a test set of 30 human-authored document-video pairs spanning diverse domains, including education, finance, policy, and scientific communication. To comprehensively assess system performance, we further introduce a two-path evaluation strategy that combines fact-based comprehension assessment (via fixed multiple-choice quizzes) and preference-based scoring using vision-language models. This dual-pronged approach captures both objective correctness and subjective quality in video delivery.
+
+Experimental results demonstrate that our method produces fluent, well-structured, and informative presentation videos, approaching human-level performance in both content delivery and viewer comprehension. These findings highlight the potential of combining language models, layout generation, and multimodal synthesis for creating explainable and scalable presentation systems from raw documents.
+
+In general, our contributions are summarized as follows:
+
+- We formulate and address the novel task of document-to-presentation video generation, which aims to produce narrated, slide-structured videos from long-form documents across diverse domains.
+
+- We propose PresentAgent, a modular generation framework that integrates document parsing, layout-aware slide composition, narration planning, and audio-visual synchronization, enabling controllable and interpretable generation.
+- We introduce PresentEval, a multi-dimensional evaluation framework powered by Vision-Language Models (VLMs), which scores videos along content, visual, and comprehension dimensions via prompt-based judging.
+- We create a test set of 30 real-world document–presentation pairs and demonstrate through experiments and ablations that PresentAgent approaches human-level performance and significantly outperforms competitive variants.
+
+#### 2 Presentation Benchmark
+
+The benchmark supports evaluation not only of fluency and fidelity, but also of downstream comprehension. Following the methodology introduced in Paper2Poster (Pang et al., 2025), we construct a quiz-style evaluation protocol (§5), where vision-language models are asked to answer factual content questions using only the generated video (slides + narration), simulating an audience's understanding. Human-authored videos are used as reference standards for both score calibration and upper-bound comparison. As shown in Figure 5, our benchmark encompasses four representative document types (academic papers, web pages, technical blogs, and slides) paired with human-authored videos, covering diverse real-world domains like education, research, and business reports.
+
+We adopt a unified, model-based evaluation framework to assess the generated presentation videos. All evaluations are conducted using a vision-language model, guided by dimension-specific prompts tailored to different assessment objectives. The framework consists of two complementary components: (1) objective quiz evaluation, which measures factual accuracy through multiple-choice question answering; and (2) subjective scoring, which rates Content Quality, Visual or Audio Quality, and Comprehension Clarity on a 1–5 scale. Together, these metrics provide a comprehensive assessment of both the quality and informativeness of the generated videos.
+
+Figure 2: Overview of our framework. Our approach addresses the full pipeline of document-to-presentation video generation and evaluation. Left: Given diverse input documents—including papers, websites, blogs, slides, and PDFs—PresentAgent generates narrated presentation videos by producing synchronized slide decks with audio. Right: To evaluate these videos, we introduce PresentEval, a two-part evaluation framework: (1) Objective Quiz Evaluation (top), which measures factual comprehension using Qwen-VL; and (2) Subjective Scoring (bottom), which uses vision-language models to rate content quality, visual design, and audio comprehension across predefined dimensions.
+
+#### 2.1 Doc2Present Dataset
+
+To support the evaluation of document-to-presentation video generation, we curate the Doc2Present Benchmark, a diverse dataset of document–presentation video pairs spanning multiple domains. Unlike prior benchmarks focused on research abstracts or slide generation, our dataset includes documents such as business reports, product manuals, policy briefs, and instructional texts, each paired with a human-crafted presentation video. We collect 30 high-quality video samples from public platforms, educational repositories, and professional presentation archives; further details regarding the data sources and statistical information of the dataset can be found in Appendix F.
+
+#### 2.2 PresentEval
+
+To assess the quality of generated presentation videos, we adopt two complementary evaluation strategies: Objective Quiz Evaluation and Subjective Scoring. For each video, we provide the vision-language model with the complete set of slide images and the full narration transcript as a unified input—simulating how a real viewer would experience the presentation. In Objective Quiz Evaluation, the model answers a fixed set of factual questions to determine whether the video accurately conveys the key information from the source content. In Subjective Scoring, the model evaluates the video along three dimensions: the coherence of the narration, the clarity and design of the visuals, and the overall ease of understanding. All evaluations are conducted without ground-truth references and rely entirely on the model's interpretation of the presented content.
+
+**Objective Quiz Evaluation** To evaluate whether a generated presentation video effectively conveys the core content of its source document, we use a fixed-question comprehension evaluation protocol. Specifically, we manually design five multiple-choice questions for each document, tailored to its content. These questions focus on key aspects such as topic recognition, structural understanding, and main argument extraction. As shown in Table 2, during evaluation, a vision-language model is given the video, including both visual frames and audio transcript, and asked to answer the five questions. Each question has four options, with one correct answer, annotated based on a human-created reference video. The final comprehension score (ranging from 0 to 5) reflects how many questions the model answered correctly, serving as a direct measure of how well the video communicates the original document.
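+
+As a concrete illustration of this scoring rule, the short sketch below is ours (not code from the paper) and assumes that both the model's replies and the answer key are given as single option letters.
+
+```python
+def quiz_score(model_answers: list[str], answer_key: list[str]) -> int:
+    """Count correctly answered questions (0-5 for the five-question quiz)."""
+    assert len(model_answers) == len(answer_key)
+    return sum(
+        pred.strip().upper()[:1] == gold.strip().upper()[:1]
+        for pred, gold in zip(model_answers, answer_key)
+    )
+
+# Example: four of the five questions answered correctly.
+print(quiz_score(["A", "B", "C", "D", "A"], ["A", "B", "C", "D", "B"]))  # -> 4
+```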
+
+**Subjective Scoring** To evaluate the quality of generated presentation videos, we adopt a prompt-based assessment using vision-language models. Instead of relying on human references or fixed metrics, we ask the model to evaluate each video from a viewer's perspective, using its own reasoning and preferences. The evaluation focuses on three aspects: coherence of narration, clarity and aesthetics of visuals, and overall ease of understanding. The model is shown the video and audio, and gives a score (1–5) with a brief explanation for each aspect. This enables scalable, consistent, and human-aligned evaluation without manual references. As shown in Table 3, we design different prompts for different modalities and tasks to ensure targeted and effective assessment.
+
+#### 3 PresentAgent
+
+To convert a long-form document into a narrated presentation video, we design a multi-stage generation framework that mirrors how human presenters prepare slides and talk tracks. Our method proceeds in four steps: segmenting the document into semantic units, composing slides with layout-aware structures, generating oral-style narration for each slide and assembling the visual and audio components into a synchronized video. This modular design supports controllability, interpretability, and multimodal alignment, enabling both high-quality generation and fine-grained evaluation. The following sections describe each component in detail.
+
+#### 3.1 Problem Formulation
+
+Our method is designed to transform a long-form document into a structured presentation video through a multi-stage generation pipeline. We provide a formal description to highlight the key difference between our approach and conventional slide-based methods.
+
+Conventional approaches often focus on generating slide elements S directly from a document chunk C, as in Equation 1, where each element includes text or image content, layout attributes, and visual style:
+
+$$S=\{e_{1},e_{2},...,e_{n}\}=f(C)\qquad\quad(1)$$
+
+In contrast, we treat the entire document D as a globally structured input and generate a presentation in three steps: (1) a sequence of semantic segments {C1, ..., CK} via outline planning, (2) a set of slides {S1, ..., SK}, each paired with a narrated audio track Tk generated by first producing a slide-specific script and then converting it to speech, and (3) a video V composed of visual and audio content aligned over time. This is defined as:
+
+$$V=\mathrm{Compose}(\{(S_{1},T_{1}),...,(S_{K},T_{K})\})=g(D)\qquad\quad(2)$$
+
+Rather than editing predefined templates or layouts, our system first identifies high-level structure in the document and then generates slide visuals and narration from scratch. This pipeline supports controllability, modular evaluation, and multimodal alignment for downstream comprehension and quality assessment.
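+
+This formulation maps naturally onto a typed pipeline. The sketch below mirrors the three steps; the function names (`plan_outline`, `compose_slide`, `narrate`, `compose_video`) are illustrative stand-ins for the components described in Sections 3.2–3.4, not the actual implementation.
+
+```python
+from dataclasses import dataclass
+
+@dataclass
+class Segment:        # C_k: a semantic segment produced by outline planning
+    title: str
+    text: str
+
+@dataclass
+class Slide:          # S_k: a rendered slide frame
+    image_path: str
+
+@dataclass
+class AudioTrack:     # T_k: narration synthesized from a slide-specific script
+    audio_path: str
+    duration_s: float
+
+def generate_presentation(document: str,
+                          plan_outline,    # D -> [Segment]
+                          compose_slide,   # Segment -> Slide
+                          narrate,         # Segment -> AudioTrack
+                          compose_video):  # [(Slide, AudioTrack)] -> path of video V
+    segments = plan_outline(document)                            # step (1)
+    pairs = [(compose_slide(c), narrate(c)) for c in segments]   # step (2)
+    return compose_video(pairs)                                  # step (3): V = g(D)
+```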
+
+#### 3.2 Slide Planning and Composition
+
+Our slide generation module is inspired by the editing-based paradigm proposed in PPTAgent (Zheng et al., 2025b), which formulates presentation construction as a structured editing process over HTML-like layouts. While PPTAgent focuses on producing editable .pptx slides, our goal is to generate visually coherent, narration-ready slide frames for downstream video synthesis. We re-implement the core idea in a self-contained pipeline tailored to multimodal synchronization.
+
+We begin by segmenting the input document into coherent content blocks using a lightweight LLM-based parser. Each block is assigned a corresponding slide type, such as bullet slide, figure-description, or title-intro, and matched with a predefined layout schema encoded in HTML. Unlike retrieval-based template matching, our system uses semantic and structural cues to map content to layout patterns in a rule-guided manner.
+
+To populate the slide, we define a set of editable operations such as replace_text, insert_image, and add_list, which are applied to the layout structure. These instructions are generated by prompting a language model with the content block and layout constraints. Slides are then rendered into static visual frames using python-pptx or HTML-based renderers.
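+
+A minimal sketch of this editing step is shown below, assuming a simplified operation schema and the default placeholder indices of a python-pptx template; the template path and operation payloads are illustrative, not our exact instruction format.
+
+```python
+from pptx import Presentation
+from pptx.util import Inches
+
+def apply_operations(template_path: str, operations: list[dict], out_path: str) -> None:
+    """Apply a small set of LLM-generated edit operations to a newly added slide.
+
+    The schema (`replace_text`, `add_list`, `insert_image`) is a simplified stand-in
+    for the editing instructions described in the text; placeholder indices depend
+    on the template in use.
+    """
+    prs = Presentation(template_path)
+    slide = prs.slides.add_slide(prs.slide_layouts[1])  # layout 1: title + content
+
+    for op in operations:
+        if op["op"] == "replace_text":
+            slide.shapes.title.text = op["text"]          # assumes a title placeholder
+        elif op["op"] == "add_list":
+            body = slide.placeholders[1].text_frame       # body placeholder (idx 1)
+            body.text = op["items"][0]
+            for item in op["items"][1:]:
+                body.add_paragraph().text = item
+        elif op["op"] == "insert_image":
+            slide.shapes.add_picture(op["path"], Inches(5.5), Inches(1.5), width=Inches(4))
+
+    prs.save(out_path)
+
+# Example call, assuming a template file and operations emitted by the prompted LLM:
+# apply_operations("layout_template.pptx",
+#                  [{"op": "replace_text", "text": "System Overview"},
+#                   {"op": "add_list", "items": ["Modular pipeline", "Layout-aware slides"]}],
+#                  "slide_01.pptx")
+```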
+
+#### 3.3 Narration and Audio Synthesis
+
+To transform the static slides into an engaging presentation, we generate a spoken narration for each slide and synthesize it into audio. The process involves two components: narration script generation and text-to-speech synthesis.
+
+For each content block corresponding to a slide, we prompt a language model to generate a concise, oral-style narration. The model is instructed to rewrite the key message of the slide into natural spoken language, avoiding dense text or technical jargon. We apply length control to ensure each narration falls within a target duration, typically between 30 and 150 seconds. Once the narration script is obtained, we synthesize the corresponding audio using a text-to-speech system. Each narration audio is paired with its slide and timestamped, forming the basis for synchronized video rendering in the next stage.
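+
+A sketch of the length-control loop is given below. The word-rate heuristic and the `generate_script` / `synthesize_speech` wrappers (e.g. around the language model and a TTS engine such as MegaTTS3) are assumptions made for illustration.
+
+```python
+def estimated_duration_s(script: str, words_per_minute: int = 150) -> float:
+    """Rough spoken-duration estimate used for length control (assumes ~150 wpm)."""
+    return len(script.split()) / words_per_minute * 60
+
+def narrate_slide(content_block: str, generate_script, synthesize_speech,
+                  min_s: float = 30.0, max_s: float = 150.0, max_retries: int = 3):
+    """Generate an oral-style script within the target duration, then synthesize audio.
+
+    `generate_script(block, hint)` and `synthesize_speech(text)` are assumed wrappers;
+    they are not the actual project APIs.
+    """
+    hint = ""
+    script = ""
+    for _ in range(max_retries):
+        script = generate_script(content_block, hint)
+        duration = estimated_duration_s(script)
+        if min_s <= duration <= max_s:
+            break
+        hint = "shorten the narration" if duration > max_s else "expand the narration"
+    audio_path = synthesize_speech(script)   # e.g. returns "slide_03.wav"
+    return script, audio_path
+```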
+
+
+
+Figure 3: Overview of the PresentAgent framework. Our system takes diverse documents (e.g., papers, websites, PDFs) as input and follows a modular generation pipeline. It first performs outline generation (Step 1) and retrieves the most suitable template (Step 2), then generates slides and narration notes via a vision-language model (Step 3). The notes are converted into audio via TTS and composed into a presentation video (Step 4). To evaluate video quality, we design multiple prompts (Step 5) and feed them into a VLM-based scoring pipeline (Step 6) that outputs dimension-specific metrics.
+
+#### 3.4 Video Assembly
+
+In the final stage, we assemble the slide images and narration audio into a coherent, time-aligned presentation video. Each slide frame is displayed for the duration of its corresponding audio segment, with optional transitions between segments. We use video processing libraries such as ffmpeg to compose the visual and audio tracks. Each slide is rendered as a static frame, and the narration is added as synchronized voiceover audio. The output is a fully rendered video file in standard formats such as .mp4, suitable for presentation, sharing, or further editing. This stage completes the transformation from a raw document into a narrated, structured presentation video.
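+
+One way to realize this stage with ffmpeg is sketched below: each slide image is looped for the length of its narration, and the resulting segments are concatenated. File names and encoding settings are illustrative rather than the exact ones used in our pipeline.
+
+```python
+import os
+import subprocess
+import tempfile
+
+def render_segment(slide_png: str, narration_wav: str, out_mp4: str) -> None:
+    """Show one slide as a still frame for exactly the length of its narration."""
+    subprocess.run([
+        "ffmpeg", "-y", "-loop", "1", "-i", slide_png, "-i", narration_wav,
+        "-c:v", "libx264", "-tune", "stillimage", "-c:a", "aac",
+        "-pix_fmt", "yuv420p", "-shortest", out_mp4,
+    ], check=True)
+
+def assemble_video(segments: list[tuple[str, str]], out_mp4: str = "presentation.mp4") -> None:
+    """Render each (slide image, narration audio) pair and concatenate the segments."""
+    seg_paths = []
+    for i, (slide_png, narration_wav) in enumerate(segments):
+        seg = f"segment_{i:03d}.mp4"
+        render_segment(slide_png, narration_wav, seg)
+        seg_paths.append(seg)
+
+    # The ffmpeg concat demuxer expects a text file listing the segment files.
+    with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
+        for p in seg_paths:
+            f.write(f"file '{os.path.abspath(p)}'\n")
+        list_file = f.name
+    subprocess.run(["ffmpeg", "-y", "-f", "concat", "-safe", "0",
+                    "-i", list_file, "-c", "copy", out_mp4], check=True)
+```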
+
+### 4 Experiments
+
+We conduct experiments to evaluate the effectiveness of our proposed system in generating high-quality, narrated presentation videos. Given the novelty of the task, our focus is not on competing with existing baselines, but rather on assessing the performance of our full system relative to human-created presentations. Comprehension accuracy is determined by performance under the PresentEval protocol. The evaluation setup can be found in Appendix H.
+
+#### 4.1 Main Results
+
+Table 1 presents evaluation results, covering both factual comprehension (Quiz Accuracy) and preference-based quality scores for video and audio outputs. In terms of quiz accuracy, most PresentAgent variants perform comparably to or better than the human reference (0.56), with Claude-3.7-sonnet (Anthropic, 2024) reaching the highest accuracy of 0.64 (tied with GPT-4o-Mini), suggesting strong alignment between the generated content and the source document. Other models such as Qwen-VL-Max (Bai et al., 2025) and Gemini-2.5-flash (DeepMind, 2024) score slightly lower (0.52), indicating room for improvement in factual grounding.
+
+In terms of subjective quality, human-created presentations still lead with the highest video and audio scores overall. However, several PresentAgent variants show competitive performance.
+
+| Method | Model | Quiz Accuracy | Video Content | Video Visual | Video Comp. | Video Mean | Audio Content | Audio Appeal | Audio Comp. | Audio Mean |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| Human | Human | 0.56 | 4.0 | 4.6 | 4.8 | 4.47 | 4.8 | 4.6 | 5.0 | 4.80 |
+| PresentAgent | Claude-3.7-sonnet | 0.64 | 4.0 | 4.0 | 4.0 | 4.00 | 4.2 | 4.6 | 4.8 | 4.53 |
+| PresentAgent | Qwen-VL-Max | 0.52 | 4.2 | 4.8 | 4.4 | 4.47 | 4.6 | 4.2 | 5.0 | 4.60 |
+| PresentAgent | Gemini-2.5-pro | 0.52 | 4.2 | 4.4 | 4.4 | 4.33 | 4.2 | 4.0 | 4.8 | 4.33 |
+| PresentAgent | Gemini-2.5-flash | 0.52 | 4.2 | 5.0 | 3.8 | 4.33 | 4.2 | 4.2 | 4.8 | 4.40 |
+| PresentAgent | GPT-4o-Mini | 0.64 | 4.8 | 4.6 | 4.6 | 4.67 | 4.0 | 4.4 | 4.8 | 4.40 |
+| PresentAgent | GPT-4o | 0.56 | 4.0 | 4.2 | 3.6 | 3.93 | 4.2 | 4.4 | 4.8 | 4.47 |
+
+Table 1: Detailed evaluation results on the 5-document test set. Fact-based evaluation includes accuracy on five fixed quiz questions (Q1–Q5). Preference-based evaluation includes 1–5 scale scores for content fidelity, visual design, and overall clarity. Each Quality Score group has a calculated mean column.
+
+
+
+Figure 4: PresentAgent Demo. Automatically generates academic-style slides and narrated videos from research papers, streamlining the transformation from written content to engaging visual presentations.
+
+For example, GPT-4o-Mini (Achiam et al., 2023) achieves the highest video content score (4.8) together with strong visual quality (4.6), while Claude-3.7-sonnet (Anthropic, 2024) delivers the most balanced audio quality (mean 4.53). Interestingly, Gemini-2.5-flash (DeepMind, 2024) scores highest in visual quality (5.0) but lower in comprehension, reflecting a trade-off between aesthetics and clarity. These results highlight the effectiveness of our modular pipeline and the usefulness of our unified PresentEval framework in capturing diverse aspects of presentation quality.
+
+#### 4.2 Analysis
+
+Figure 4 presents a full example of a presentation video auto-generated by PresentAgent, showing a technical blog turned into a narrated presentation. The system identifies structural segments (e.g., introduction, technical explanations) and generates slides with oral-style captions and synchronized speech, covering topics such as "parallelization workflow" and "agent system architecture". This demonstrates the system's ability to maintain technical accuracy while delivering content clearly and conversationally.
+
+### 5 Conclusion
+
+In conclusion, we presented PresentAgent, a modular system for transforming long-form documents into narrated presentation videos. By addressing the challenges of slide planning, narration synthesis, and synchronized rendering, PresentAgent enables structured, controllable, and reusable multimodal outputs. To evaluate this novel task, we introduced a diverse benchmark and proposed complementary factual and preference-based metrics. Experimental results show that PresentAgent generates coherent, engaging, and informative presentations, approaching human quality. This work lays the groundwork for automated, explainable content generation and opens new directions for research in multimodal communication across education, business, and accessibility.
+
+### References
+
+- Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, and 1 others. 2023. Gpt-4 technical report. *arXiv preprint arXiv:2303.08774*.
+- Rie Kubota Ando and Tong Zhang. 2005. A framework for learning predictive structures from multiple tasks and unlabeled data. *Journal of Machine Learning Research*, 6:1817–1853.
+- Galen Andrew and Jianfeng Gao. 2007. Scalable training of L1-regularized log-linear models. In *Proceedings of the 24th International Conference on Machine Learning*, pages 33–40.
+- Anthropic. 2024. Claude 3 technical overview. https://www.anthropic.com/news/claude-3. Accessed: 2025-06-30.
+- Shuai Bai, Keqin Chen, Xuejing Liu, Jialin Wang, Wenbin Ge, Sibo Song, Kai Dang, Peng Wang, Shijie Wang, Jun Tang, and 1 others. 2025. Qwen2.5-vl technical report. *arXiv preprint arXiv:2502.13923*.
+- Iz Beltagy, Matthew E Peters, and Arman Cohan. 2020. Longformer: The long-document transformer. *arXiv preprint arXiv:2004.05150*.
+- Jiaao Chen and Diyi Yang. 2021. Structure-aware abstractive conversation summarization via discourse and action graphs. *arXiv preprint arXiv:2104.08400*.
+- Google DeepMind. 2024. Gemini 2.5: Pushing the frontier with advanced reasoning, multimodality, long context, and next generation agentic capabilities. https://deepmind.google/technologies/gemini/. Accessed: 2025-06-30.
+- Chaorui Deng, Deyao Zhu, Kunchang Li, Chenhui Gou, Feng Li, Zeyu Wang, Shu Zhong, Weihao Yu, Xiaonan Nie, Ziang Song, and 1 others. 2025. Emerging properties in unified multimodal pretraining. *arXiv preprint arXiv:2505.14683*.
+- Tsu-Jui Fu, William Yang Wang, Daniel McDuff, and Yale Song. 2022. Doc2ppt: Automatic presentation slides generation from scientific documents. In *Proceedings of the AAAI Conference on Artificial Intelligence*, volume 36, pages 634–642.
+- Jiaxin Ge, Zora Zhiruo Wang, Xuhui Zhou, Yi-Hao Peng, Sanjay Subramanian, Qinyue Tan, Maarten Sap, Alane Suhr, Daniel Fried, Graham Neubig, and Trevor Darrell. 2025. Autopresent: Designing structured visuals from scratch. *arXiv preprint arXiv:2501.00912*.
+- Yingqing He, Menghan Xia, Haoxin Chen, Xiaodong Cun, Yuan Gong, Jinbo Xing, Yong Zhang, Xintao Wang, Chao Weng, Ying Shan, and 1 others. 2023. Animate-a-story: Storytelling with retrieval-augmented video generation. *arXiv preprint arXiv:2307.06940*.
+- Levon Khachatryan, Andranik Movsisyan, Vahram Tadevosyan, Roberto Henschel, Zhangyang Wang, Shant Navasardyan, and Humphrey Shi. 2023. Text2video-zero: Text-to-image diffusion models are zero-shot video generators. In *Proceedings of the IEEE/CVF International Conference on Computer Vision*, pages 15954–15964.
+- Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov, and Luke Zettlemoyer. 2019. Bart: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. *arXiv preprint arXiv:1910.13461*.
+- Bo Li, Yuanhan Zhang, Dong Guo, Renrui Zhang, Feng Li, Hao Zhang, Kaichen Zhang, Peiyuan Zhang, Yanwei Li, Ziwei Liu, and 1 others. 2024. Llava-onevision: Easy visual task transfer. *arXiv preprint arXiv:2408.03326*.
+- Xin Li, Wenqing Chu, Ye Wu, Weihang Yuan, Fanglong Liu, Qi Zhang, Fu Li, Haocheng Feng, Errui Ding, and Jingdong Wang. 2023. Videogen: A reference-guided latent diffusion approach for high definition text-to-video generation. *arXiv preprint arXiv:2309.00398*.
+- Kevin Qinghong Lin, Linjie Li, Difei Gao, Qinchen Wu, Mingyi Yan, Zhengyuan Yang, Lijuan Wang, and Mike Zheng Shou. 2024a. Videogui: A benchmark for gui automation from instructional videos. *arXiv preprint arXiv:2406.10227*.
+- Kevin Qinghong Lin, Linjie Li, Difei Gao, Zhengyuan Yang, Shiwei Wu, Zechen Bai, Weixian Lei, Lijuan Wang, and Mike Zheng Shou. 2024b. Showui: One vision-language-action model for gui visual agent. *arXiv preprint arXiv:2411.17465*.
+- Pan Lu, Bowen Chen, Sheng Liu, Rahul Thapa, Joseph Boen, and James Zou. 2025. Octotools: An agentic framework with extensible tools for complex reasoning. *arXiv preprint arXiv:2502.11271*.
+- Shravan Nayak, Xiangru Jian, Kevin Qinghong Lin, Juan A. Rodriguez, Montek Kalsi, Rabiul Awal, Nicolas Chapados, M. Tamer Özsu, Aishwarya Agrawal, David Vazquez, Christopher Pal, Perouz Taslakian, Spandana Gella, and Sai Rajeswar. 2025. Ui-vision: A desktop-centric gui benchmark for visual perception and interaction. *arXiv preprint arXiv:2503.15661*.
+- Junrui Ni, Liming Wang, Heting Gao, Kaizhi Qian, Yang Zhang, Shiyu Chang, and Mark Hasegawa-Johnson. 2022. Unsupervised text-to-speech synthesis by unsupervised automatic speech recognition. *arXiv preprint arXiv:2203.15796*.
+- Wei Pang, Kevin Qinghong Lin, Xiangru Jian, Xi He, and Philip Torr. 2025. Paper2poster: Towards multimodal poster automation from scientific papers. *arXiv preprint arXiv:2505.21497*.
+- Vadim Popov, Ivan Vovk, Vladimir Gogoryan, Tasnima Sadekova, and Mikhail Kudinov. 2021. Grad-tts: A diffusion probabilistic model for text-to-speech. In *International conference on machine learning*, pages 8599–8608. PMLR.
+- Yujia Qin, Yining Ye, Junjie Fang, Haoming Wang, Shihao Liang, Shizuo Tian, Junda Zhang, Jiahao Li, Yunxin Li, Shijue Huang, and 1 others. 2025. Ui-tars: Pioneering automated gui interaction with native agents. *arXiv preprint arXiv:2501.12326*.
+- Mohammad Sadegh Rasooli and Joel R. Tetreault. 2015. Yara parser: A fast and accurate dependency parser. *Computing Research Repository*, arXiv:1503.06733. Version 2.
+- Yi Ren, Yangjun Ruan, Xu Tan, Tao Qin, Sheng Zhao, Zhou Zhao, and Tie-Yan Liu. 2019. Fastspeech: Fast, robust and controllable text to speech. *Advances in neural information processing systems*, 32.
+- Timo Schick, Jane Dwivedi-Yu, Roberto Dessì, and others. 2023. Toolformer: Language models can teach themselves to use tools. *arXiv preprint arXiv:2302.04761*.
+- Shivam R Solanki and Drupad K Khublani. 2024. From script to screen: Unveiling text-to-video generation. In *Generative Artificial Intelligence: Exploring the Power and Potential of Generative AI*, pages 81–112. Springer.
+- Qiushi Sun, Kanzhi Cheng, Zichen Ding, Chuanyang Jin, Yian Wang, Fangzhi Xu, Zhenyu Wu, Chengyou Jia, Liheng Chen, Zhoumianze Liu, and 1 others. 2024. Os-genesis: Automating gui agent trajectory construction via reverse task synthesis. *arXiv preprint arXiv:2412.19723*.
+- Hideyuki Tachibana, Katsuya Uenoyama, and Shunsuke Aihara. 2018. Efficiently trainable text-to-speech system based on deep convolutional networks with guided attention. In *2018 IEEE international conference on acoustics, speech and signal processing (ICASSP)*, pages 4784–4788. IEEE.
+- Baode Wang, Biao Wu, Weizhen Li, Meng Fang, Yanjie Liang, Zuming Huang, Haozhe Wang, Jun Huang, Ling Chen, Wei Chu, and 1 others. 2025. Infinity parser: Layout aware reinforcement learning for scanned document parsing. *arXiv preprint arXiv:2506.03197*.
+- Guanghua Wang, Priyanshi Garg, and Weili Wu. 2024a. Segmented summarization and refinement: A pipeline for long-document analysis on social media. *Journal of Social Computing*, 5(2):132–144.
+- Peng Wang, Shuai Bai, Sinan Tan, Shijie Wang, Zhihao Fan, Jinze Bai, Keqin Chen, Xuejing Liu, Jialin Wang, Wenbin Ge, and 1 others. 2024b. Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution. *arXiv preprint arXiv:2409.12191*.
+- Xingyao Wang, Boxuan Li, Yufan Song, Frank F Xu, Xiangru Tang, Mingchen Zhuge, Jiayi Pan, Yueqi Song, Bowen Li, Jaskirat Singh, and 1 others. 2024c. Opendevin: An open platform for ai software developers as generalist agents. *arXiv preprint arXiv:2407.16741*.
+- Yuan Wang, Di Huang, Yaqi Zhang, Wanli Ouyang, Jile Jiao, Xuetao Feng, Yan Zhou, Pengfei Wan, Shixiang Tang, and Dan Xu. 2024d. Motiongpt-2: A general-purpose motion-language model for motion generation and understanding. *arXiv preprint arXiv:2410.21747*.
+- Biao Wu, Yanda Li, Meng Fang, Zirui Song, Zhiwei Zhang, Yunchao Wei, and Ling Chen. 2024. Foundations and recent trends in multimodal mobile agents: A survey. *arXiv preprint arXiv:2411.02006*.
+- Jinheng Xie, Weijia Mao, Zechen Bai, David Junhao Zhang, Weihao Wang, Kevin Qinghong Lin, Yuchao Gu, Zhijie Chen, Zhenheng Yang, and Mike Zheng Shou. 2024. Show-o: One single transformer to unify multimodal understanding and generation. *arXiv preprint arXiv:2408.12528*.
+- Jin Xu, Zhifang Guo, Jinzheng He, Hangrui Hu, Ting He, Shuai Bai, Keqin Chen, Jialin Wang, Yang Fan, Kai Dang, and 1 others. 2025. Qwen2.5-omni technical report. *arXiv preprint arXiv:2503.20215*.
+- Qiyao Xue, Xiangyu Yin, Boyuan Yang, and Wei Gao. 2025. Phyt2v: Llm-guided iterative self-refinement for physics-grounded text-to-video generation. In *Proceedings of the Computer Vision and Pattern Recognition Conference*, pages 18826–18836.
+- John Yang, Carlos Jimenez, Alexander Wettig, Kilian Lieret, Shunyu Yao, Karthik Narasimhan, and Ofir Press. 2024a. Swe-agent: Agent-computer interfaces enable automated software engineering. *Advances in Neural Information Processing Systems*, 37:50528–50652.
+- Ke Yang, Jiateng Liu, John Wu, Chaoqi Yang, Yi R Fung, Sha Li, Zixuan Huang, Xu Cao, Xingyao Wang, Yiquan Wang, and 1 others. 2024b. If llm is the wizard, then code is the wand: A survey on how code empowers large language models to serve as intelligent agents. *arXiv preprint arXiv:2401.00812*.
+- Rui Yang, Lin Song, Yanwei Li, Sijie Zhao, Yixiao Ge, Xiu Li, and Ying Shan. 2023a. Gpt4tools: Teaching large language model to use tools via self-instruction. *Advances in Neural Information Processing Systems*, 36:71995–72007.
+- Zhengyuan Yang, Linjie Li, Jianfeng Wang, Kevin Lin, Ehsan Azarnasab, Faisal Ahmed, Zicheng Liu, Ce Liu, Michael Zeng, and Lijuan Wang. 2023b. Mm-react: Prompting chatgpt for multimodal reasoning and action. *arXiv preprint arXiv:2303.11381*.
+- Zhuoyi Yang, Jiayan Teng, Wendi Zheng, Ming Ding, Shiyu Huang, Jiazheng Xu, Yuanming Yang, Wenyi Hong, Xiaohan Zhang, Guanyu Feng, and 1 others. 2024c. Cogvideox: Text-to-video diffusion models with an expert transformer. *arXiv preprint arXiv:2408.06072*.
+
+- Shunyu Yao, Jeffrey Zhao, Dian Yu, Nan Du, Izhak Shafran, Karthik R Narasimhan, and Yuan Cao. 2023. React: Synergizing reasoning and acting in language models. In *The Eleventh International Conference on Learning Representations*.
+- Murong Yue, Wenlin Yao, Haitao Mi, Dian Yu, Ziyu Yao, and Dong Yu. 2024. Dots: Learning to reason dynamically in llms via optimal reasoning trajectories search. *arXiv preprint arXiv:2410.03864*.
+- Zeyu Zhang, Yiran Wang, Biao Wu, Shuo Chen, Zhiyuan Zhang, Shiya Huang, Wenbo Zhang, Meng Fang, Ling Chen, and Yang Zhao. 2024. Motion avatar: Generate human and animal avatars with arbitrary motion. *arXiv preprint arXiv:2405.11286*.
+- Hao Zheng, Xinyan Guan, Hao Kong, Jia Zheng, Weixiang Zhou, Hongyu Lin, Yaojie Lu, Ben He, Xianpei Han, and Le Sun. 2025a. Pptagent: Generating and evaluating presentations beyond text-to-slides. *arXiv preprint arXiv:2501.03936*.
+- Hao Zheng, Xinyan Guan, Hao Kong, Jia Zheng, Weixiang Zhou, Hongyu Lin, Yaojie Lu, Ben He, Xianpei Han, and Le Sun. 2025b. Pptagent: Generating and evaluating presentations beyond text-to-slides. *arXiv preprint arXiv:2501.03936*.
+- Zixiang Zhou, Yu Wan, and Baoyuan Wang. 2024. Avatargpt: All-in-one framework for motion understanding planning generation and beyond. In *Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition*, pages 1357–1366.
+
+## A Related Work
+
+### A.1 Document-to-Multimodal Generation
+
+Recent advances in large language models (LLMs) and multimodal generation have sparked growing interest in converting documents into diverse output formats, such as slides, posters, or audio summaries (Xu et al., 2025; Wang et al., 2025; Pang et al., 2025; Sun et al., 2024). Systems like PPTAgent (Zheng et al., 2025b) and Doc2PPT (Fu et al., 2022) treat document-to-slide generation as a structured summarization problem, focusing on layout-aware slide construction. Other works, such as Paper2Poster (Pang et al., 2025), extend this idea by producing single-page visual summaries using layout planning and visual feedback. However, these systems typically generate static outputs and do not model time-dependent delivery such as narration or slide progression. Our work builds upon these foundations, but further introduces temporal planning and audio-visual synchronization, enabling the generation of fully narrated presentation videos.
+
+### A.2 Vision-Language Agents
+
+Recent advances have highlighted the expanding capabilities of vision-language models (VLMs) beyond traditional language understanding. Techniques such as ReAct (Yao et al., 2023; Yang et al., 2023b; Yue et al., 2024) have shown that LLMs can operate as autonomous agents, capable of step-by-step reasoning and dynamic interaction through code execution (Wang et al., 2024c; Yang et al., 2024a,b), API function calls (Schick et al., 2023; Lu et al., 2025; Yang et al., 2023a), user interface manipulation (Lin et al., 2024b; Qin et al., 2025; Nayak et al., 2025; Wu et al., 2024), and motion generation (Zhang et al., 2024; Zhou et al., 2024; Wang et al., 2024d). Despite these developments, general-purpose agents still struggle with professional tasks that demand accuracy, domain-specific knowledge, and reliable interaction (Lin et al., 2024a). A closely related area is slide automation (Ge et al., 2025; Zheng et al., 2025a), in which agents translate short text prompts into executable Python code to render presentation slides. In contrast, our proposed presentation video generation task is significantly more challenging: instead of taking a short prompt as input, the system processes an entire long-form document, such as a research paper, product manual, or technical report, and produces a well-structured presentation video with oral-style narration. This task imposes higher demands on content understanding, multimodal alignment, speech generation, and video synthesis. To address these challenges, we design a generation pipeline along with an automatic evaluation framework to systematically assess the generated videos in terms of information delivery, visual quality, and overall comprehensibility.
+
+## B Implementation Details
+
+PresentAgent is implemented using a modular architecture that integrates LLMs, VLMs, and text-to-speech (TTS) systems. Our primary models include LLMs (GPT-4o, GPT-4o-mini, Claude-3.7-sonnet) and VLMs (Qwen-VL-Max, Gemini-2.5-Flash, Gemini-2.5-Pro). For TTS, we choose the MegaTTS3 model for its stronger performance, and for visual and multimodal evaluation we use Qwen-VL-2.5-3B-Instruct as the VLM.
+
+In our experimental pipeline, any input document is automatically transformed into a PowerPoint deck, paired with a generated audio narration, and then composited into a synchronized video presentation.
+
+PresentAgent adopts a highly modular multimodal generation architecture. At the language understanding and generation layer, we run six primary LLM back ends in parallel (GPT-4o, GPT-4o-mini, Qwen-VL-Max, Gemini-2.5-Flash, Gemini-2.5-Pro, and Claude-3.7-Sonnet) and select or ensemble them on the fly with a dynamic routing policy that weighs input length, conversational complexity, and latency budget. For vision-language evaluation, we introduce the lightweight VLM Qwen-VL-2.5-3B-Instruct to score slide layout, chart readability, and cross-modal consistency, feeding its self-critique back into generation. Speech synthesis is unified on MegaTTS3, which outputs 24 kHz, 16-bit high-fidelity narration and supports prosody-tag controls for fine-grained rate, pitch, and emotion adjustment.
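+
+A highly simplified sketch of such a routing policy is shown below; the thresholds and the complexity score are illustrative placeholders rather than the values used in PresentAgent.
+
+```python
+def choose_backend(doc_length_tokens: int, complexity: float, latency_budget_s: float) -> str:
+    """Pick a back end from the pool described above.
+
+    Prefers lighter models when the latency budget is tight and longer-context
+    models for very long inputs; all cut-off values are illustrative.
+    """
+    if latency_budget_s < 10:
+        return "GPT-4o-mini" if complexity < 0.5 else "Gemini-2.5-Flash"
+    if doc_length_tokens > 100_000:
+        return "Gemini-2.5-Pro"      # very long documents
+    if complexity > 0.8:
+        return "Claude-3.7-Sonnet"   # heaviest reasoning load
+    return "GPT-4o"
+```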
+
+The experimental pipeline converts any input document (PDF, Markdown, DOCX, or web snapshot) through three automated stages:
+
+1. Structured parsing and re-ordering that maps content to a hierarchical topic–subtopic tree.
+
+2. Per-slide generation with the chosen LLM, producing a PowerPoint deck containing titles, bullet points, graphic placeholders, and alt text, while retrieving and inserting relevant images for key nouns.
+
+3. Synchronized narration generation with MegaTTS3 in Chinese or English, followed by an FFmpeg script that assembles a 1080p video with fade-in/out transitions and optional captions.
+
+## C Discussion
+
+In this work, we synthesized presentation-style videos that integrate visual slides, textual narration, and spoken audio, simulating realistic multimodal communication scenarios. While our current evaluation focuses on the individual quality of each modality—such as visual clarity, textual relevance, and audio intelligibility—these dimensions are treated independently. However, in real-world applications, the effectiveness of communication often hinges on the semantic and temporal coherence across modalities.
+
+Future research should thus move beyond isolated assessments and aim toward fusion-aware understanding and evaluation. This entails not only modeling the interactions and alignment among image, audio, and text modalities, but also enabling the system to reason over their combined meaning. Existing models like ImageBind offer a unified embedding space for multiple modalities, but lack the capacity for high-level inference and semantic comprehension.
+
+A promising direction lies in bridging representation alignment with multimodal reasoning, by integrating aligned modality encoders with powerful language models. This would allow the system to jointly perceive, interpret, and respond to complex multimodal inputs—such as explaining a visual concept based on both audio narration and visual cues, or identifying inconsistencies across modalities. Developing such reasoning-capable, fusion-aware models will be critical for advancing robust, coherent multimodal understanding in real-world applications.
+
+## D Limitations
+
+Our work faces two key constraints: (1) Due to the high computational costs of commercial LLM/VLM APIs (e.g., GPT-4o and Gemini-2.5-Pro), evaluation was limited to five academic papers, potentially underrepresenting the document diversity shown in our benchmark (Figure 5); (2) PresentAgent currently generates static slides without dynamic animations or effects, due to architectural constraints in video synthesis and trade-offs between generation speed and visual quality, as noted in ChronoMagic-Bench's temporal coherence studies. Future improvements could involve lightweight distillation models and physics-aware rendering engines.
+
+## E Evaluation Benchmark
+
+As shown in Figure 5, we showcase four representative document types in our benchmark: academic papers, web pages, technical blogs, and presentation slides. These documents cover a broad spectrum of real-world content domains, such as educational tutorials, research briefs, product manuals, scientific articles, news commentary, and business reports. Each document is paired with a manually authored presentation video, providing a diverse and realistic testbed for evaluating document-to-video generation systems in terms of multimodal coherence, content preservation, and presentation quality.
+
+## F Doc2Present Dataset Details
+
+Data Source. We collect 30 high-quality video samples from public platforms, educational repositories, and professional presentation archives. Each video follows a structured narration format, combining slide-based visuals with synchronized voiceover. We manually align each video with its source document and ensure the following conditions are met: (1) the content structure of the video follows that of the document; (2) the visuals convey document information in a compact, structured form; and (3) the narration and slides are well-aligned temporally.
+
+Data Statistics. The average document length is 3,000–8,000 words, while the corresponding videos range from 1 to 2 minutes and contain 5–10 slides. This setting highlights the core challenge of the task: transforming dense, domain-specific documents into effective and digestible multimodal presentations.
+
+## G PresentEval
+
+### G.1 Prompts of Objective Quiz Evaluation
+
+Table 2 presents the prompting content for the evaluation method utilizing objective quiz-based assessment. Each set of questions included in this evaluation is crafted manually, with its creation firmly rooted in the actual content of the relevant documents. The formulation of these questions
+
+
+
+Figure 5: Document Diversity in Our Evaluation Benchmark.
+
+| Presentation of Web Pages | What is the main feature highlighted in the iPhone's promotional webpage? |
+| --- | --- |
+| A. | A more powerful chip for faster performance |
+| B. | A brighter and more vibrant display |
+| C. | An upgraded camera system with better lenses |
+| D. | A longer-lasting and more efficient battery |
+| Presentation of Academic Paper | What primary research gap did the authors aim to address by introducing the FineGym dataset? |
+| A. | Lack of low-resolution sports footage for compression studies |
+| B. | Need for fine-grained action understanding that goes beyond coarse categories |
+| C. | Absence of synthetic data to replace human annotations |
+| D. | Shortage of benchmarks for background context recognition |
+
+Table 2: Prompt of evaluation via Objective Quiz Evaluation. Each question set is manually created based on the actual document content, with a focus on topic recognition, structural understanding, and key argument identification. These questions evaluate how well the generated video communicates the source material.
+
+places a distinct emphasis on three key aspects: topic recognition, which involves the ability to accurately identify and grasp the central themes of the source material; structural understanding, referring to the comprehension of the organizational framework and logical arrangement of the document; and key argument identification, focusing on the capacity to pinpoint the core viewpoints and supporting arguments within the content. These carefully designed questions serve as a means to evaluate the extent to which the generated video successfully conveys the essential information, core ideas, and structural logic of the original source material, thereby assessing the effectiveness of the video in communicating the source content.
+
+### G.2 Prompts of Subjective Scoring
+
+The prompts for evaluation via Subjective Scoring are shown in Table 3. This table showcases the prompting content employed in the subjective scoring-based evaluation approach. Each individual prompt within this set is precisely targeted at a specific evaluative dimension. These dimensions encompass narrative coherence, which pertains to the logical flow and consistency of the storytelling; visual appeal and audio appeal, focusing on the attractiveness and engaging nature of the visual elements and audio components respectively; and comprehension difficulty, referring to the level of ease or challenge in understanding the presented content. These prompts are meticulously designed to serve as a guiding framework for vision-language models, enabling them to assess presentations from a human-centric perspective. This means that the evaluation aligns with human perceptions, preferences, and ways of understanding, ensuring that the assessment results are more in line with how humans would judge the quality of the presentations.
+
+## H Evaluation Setup
+
+We construct a test set consisting of 30 long-form documents, each paired with a manually created presentation video that serves as a human-level reference. These documents span a diverse range of topics, including education, product explanation,
+
+| Video | Scoring Prompt |
+| --- | --- |
+| Narr. Coh. | "How coherent is the narration across the video? Are the ideas logically connected and easy to follow?" |
+| Visual Appeal | "How would you rate the visual design of the slides in terms of layout, aesthetics, and overall quality?" |
+| Comp. Diff. | "How easy is it to understand the presentation as a viewer? Were there any confusing or contradictory parts?" |
+| Audio | Scoring Prompt |
+| Narr. Coh. | "How coherent is the narration throughout the audio? Are the ideas logically structured and easy to follow?" |
+| Audio Appeal | "How pleasant and engaging is the narrator's voice in terms of tone, pacing, and delivery?" |
+| Comp. Diff. | "How easy is it to understand the spoken content? Were there any unclear or confusing parts in the audio?" |
+
+Table 3: Prompt of evaluation via Subjective Scoring. Each prompt targets a specific dimension—narrative coherence, visual/audio appeal, or comprehension difficulty—and is designed to guide vision-language models in assessing presentations from a human-centric perspective. Abbreviations: Narr. Coh. = Narrative Coherence; Comp. Diff. = Comprehension Difficulty.
+
+research overviews, and policy briefings. For each document, we generate a corresponding presentation video using our full generation pipeline.
+
+All videos, both human-created and machine-generated, are evaluated using our unified evaluation framework, PresentEval. Each synthesized video is approximately two minutes in length. However, due to the current lack of a single multimodal model capable of jointly assessing visual and audio quality for videos longer than two minutes, we adopt a split evaluation strategy.
+
+In the Objective Quiz stage, we use Qwen-VL-2.5-3B (Wang et al., 2024b) to evaluate the accuracy of the entire video using a fixed set of multiple-choice comprehension questions. In the Subjective Scoring stage, we extract short video/audio segments and evaluate them individually to assess quality in a more focused and scalable manner, using Qwen-Omni-7B (Xu et al., 2025).
+
+Both models are guided by dimension-specific prompts and score each video or audio sample along three axes: Content Quality, Visual Quality, and Comprehension Accuracy.
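+
+A minimal sketch of this dimension-wise scoring loop is shown below. The prompts are taken from the video track of Table 3, and `query_model` is an assumed wrapper around the evaluator model (e.g. Qwen-Omni-7B for the subjective stage); the reply parsing is deliberately simple.
+
+```python
+import re
+
+# Dimension-specific prompts from Table 3 (video track).
+VIDEO_PROMPTS = {
+    "narrative_coherence": "How coherent is the narration across the video? Are the ideas logically connected and easy to follow?",
+    "visual_appeal": "How would you rate the visual design of the slides in terms of layout, aesthetics, and overall quality?",
+    "comprehension_difficulty": "How easy is it to understand the presentation as a viewer? Were there any confusing or contradictory parts?",
+}
+
+def score_video(segment_path: str, query_model) -> dict[str, int]:
+    """Ask the evaluator model for a 1-5 score per dimension and parse the first digit.
+
+    `query_model(media_path, prompt)` is an assumed helper that returns free-form
+    text such as "Score: 4. The narration is mostly coherent ...".
+    """
+    scores = {}
+    for dimension, prompt in VIDEO_PROMPTS.items():
+        reply = query_model(segment_path, prompt + " Answer with a score from 1 to 5.")
+        match = re.search(r"[1-5]", reply)
+        scores[dimension] = int(match.group()) if match else 0  # 0 marks an unparseable reply
+    return scores
+```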
+
diff --git a/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/source.pdf b/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/source.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..2884a9ae9161d6fcd7ed6c8463d9127e17a8c786
--- /dev/null
+++ b/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/source.pdf
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d30fa77088faf2e5e9a1ea32d787b218e7a5a455f07ed9c192694ed5174871d
+size 1829248
diff --git a/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/table_21d2.png b/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/table_21d2.png
new file mode 100644
index 0000000000000000000000000000000000000000..64f93310f0f4dbce6c6a07ce5c5680a0f0a52c69
Binary files /dev/null and b/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/table_21d2.png differ
diff --git a/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/table_efca.png b/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/table_efca.png
new file mode 100644
index 0000000000000000000000000000000000000000..3dadb52863c962d427a31e338201c7734c30ae0b
Binary files /dev/null and b/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/table_efca.png differ
diff --git a/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/table_f5f7.png b/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/table_f5f7.png
new file mode 100644
index 0000000000000000000000000000000000000000..f547e4a71602bd2677bd542efa6f579a9b5aa058
Binary files /dev/null and b/pptagent/runs/pdf/9145dbfce1296e2b0603293042aa883e/table_f5f7.png differ
diff --git a/pptagent/runs/ppt_video/ca046385-ac3d-4240-9284-a96c57d934d3/output.mp4 b/pptagent/runs/ppt_video/ca046385-ac3d-4240-9284-a96c57d934d3/output.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..cff93b20629d09c45ed457acd32e831a9bad3c60
--- /dev/null
+++ b/pptagent/runs/ppt_video/ca046385-ac3d-4240-9284-a96c57d934d3/output.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:33ae87a36e52d48b8ace4eae461d382ea9a42125b550d223a4ffeb213e06fc36
+size 484166
diff --git a/pptagent/runs/ppt_video/ca046385-ac3d-4240-9284-a96c57d934d3/source.pdf b/pptagent/runs/ppt_video/ca046385-ac3d-4240-9284-a96c57d934d3/source.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..d431cc5a975e8ae84f9453bc2d604c6b8a525f32
Binary files /dev/null and b/pptagent/runs/ppt_video/ca046385-ac3d-4240-9284-a96c57d934d3/source.pdf differ
diff --git a/pptagent/runs/ppt_video/ca046385-ac3d-4240-9284-a96c57d934d3/source.pptx b/pptagent/runs/ppt_video/ca046385-ac3d-4240-9284-a96c57d934d3/source.pptx
new file mode 100644
index 0000000000000000000000000000000000000000..d363ad7656b7bd0c87f202adf28763fb6772d8da
Binary files /dev/null and b/pptagent/runs/ppt_video/ca046385-ac3d-4240-9284-a96c57d934d3/source.pptx differ
diff --git a/pptagent/runs/ppt_video/e88b9f32-6b97-4096-abd6-9bee103524b6/output.mp4 b/pptagent/runs/ppt_video/e88b9f32-6b97-4096-abd6-9bee103524b6/output.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..be481a7a96bdfa7d91bccefc0374794b032864db
--- /dev/null
+++ b/pptagent/runs/ppt_video/e88b9f32-6b97-4096-abd6-9bee103524b6/output.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5eee4df12ff7a893d12cdce87dee3e2df826ada26b50807090a63af7a3f31d92
+size 4414794
diff --git a/pptagent/runs/ppt_video/e88b9f32-6b97-4096-abd6-9bee103524b6/source.pdf b/pptagent/runs/ppt_video/e88b9f32-6b97-4096-abd6-9bee103524b6/source.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..4d96791f6a89cb0c762e387bacef328280181c91
Binary files /dev/null and b/pptagent/runs/ppt_video/e88b9f32-6b97-4096-abd6-9bee103524b6/source.pdf differ
diff --git a/pptagent/runs/ppt_video/e88b9f32-6b97-4096-abd6-9bee103524b6/source.pptx b/pptagent/runs/ppt_video/e88b9f32-6b97-4096-abd6-9bee103524b6/source.pptx
new file mode 100644
index 0000000000000000000000000000000000000000..d363ad7656b7bd0c87f202adf28763fb6772d8da
Binary files /dev/null and b/pptagent/runs/ppt_video/e88b9f32-6b97-4096-abd6-9bee103524b6/source.pptx differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/image_stats.json b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/image_stats.json
new file mode 100644
index 0000000000000000000000000000000000000000..ed2e58521de66dd3f511511ed0c69b6a11765779
--- /dev/null
+++ b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/image_stats.json
@@ -0,0 +1,93 @@
+{
+ "a1c98d25e5c2a3059235733edc58ea6984e75dc9.png": {
+ "size": [
+ 2873,
+ 1069
+ ],
+ "appear_times": 1,
+ "slide_numbers": [
+ 1
+ ],
+ "relative_area": 6.404320987654321,
+ "top_ranges_str": "1",
+ "caption": "Logo: BMVC 2024 logo featuring a blue circuit-like pattern resembling a camera or technological device on the left side with \"BMVC 2024\" text in dark blue on the right."
+ },
+ "83d1124da2030bef8f40da55db202923268685e2.png": {
+ "size": [
+ 296,
+ 296
+ ],
+ "appear_times": 1,
+ "slide_numbers": [
+ 1
+ ],
+ "relative_area": 2.3909465020576133,
+ "top_ranges_str": "1",
+ "caption": "Picture: A blank white image with no visible content or elements to describe."
+ },
+ "5452ff4f227c6ba1d7ad666974203486e642daf6.png": {
+ "size": [
+ 1672,
+ 703
+ ],
+ "appear_times": 1,
+ "slide_numbers": [
+ 6
+ ],
+ "relative_area": 74.5679012345679,
+ "top_ranges_str": "6",
+ "caption": "Diagram: An illustration of wolf motion synthesis showing original actions (Howl, Walk, Attack, Die) and expanded combined motions through a SinMDM model, with a workflow demonstrating how text prompts about wolf behaviors are refined through AI models to generate detailed motion descriptions."
+ },
+ "35639ff12c3127b2ba9419b7c784b212753ff628.png": {
+ "size": [
+ 1221,
+ 524
+ ],
+ "appear_times": 1,
+ "slide_numbers": [
+ 9
+ ],
+ "relative_area": 66.79149519890261,
+ "top_ranges_str": "9",
+ "caption": "Diagram: A comprehensive AI pipeline showing the process of generating 3D animated jaguar models from text prompts, including SDXL for image creation, TripoSR for 3D mesh conversion, and MoMASK for motion sequence generation."
+ },
+ "203e2300314026057b7257a3c105a8d2fad5183e.png": {
+ "size": [
+ 900,
+ 506
+ ],
+ "appear_times": 1,
+ "slide_numbers": [
+ 10
+ ],
+ "relative_area": 22.124485596707817,
+ "top_ranges_str": "10",
+ "caption": "Diagram: A collection of sequential animation frames showing various character movements including people, wolves, horses, and jaguars in different actions like walking, running, jumping, and attacking."
+ },
+ "4bbdd852ecafe7b9f1c65dfdbba4a04a5de91642.png": {
+ "size": [
+ 1233,
+ 225
+ ],
+ "appear_times": 1,
+ "slide_numbers": [
+ 13
+ ],
+ "relative_area": 24.963991769547324,
+ "top_ranges_str": "13",
+ "caption": "Table: Comparison of model performance metrics showing LLM Planner outperforming LLaMA-7B with significantly higher accuracy scores across animal, motion, and overall categories, with green numbers indicating percentage improvements."
+ },
+ "fee3a1e81ae1678f114d5799e440cc2b7d740aa1.png": {
+ "size": [
+ 1636,
+ 988
+ ],
+ "appear_times": 1,
+ "slide_numbers": [
+ 14
+ ],
+ "relative_area": 39.7136488340192,
+ "top_ranges_str": "14",
+ "caption": "Diagram: Collection of 3D character models showing various animated creatures including fantasy characters (demon and dragon-themed anime figures, yellow Yacuruna), animals (bear, cats, dogs, horses) and a cobra snake, each displayed from multiple angles with identifying labels."
+ }
+}
\ No newline at end of file
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/images/203e2300314026057b7257a3c105a8d2fad5183e.png b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/images/203e2300314026057b7257a3c105a8d2fad5183e.png
new file mode 100644
index 0000000000000000000000000000000000000000..b9d3f1d64409d4393873a9f52c029ceb45ef80f5
--- /dev/null
+++ b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/images/203e2300314026057b7257a3c105a8d2fad5183e.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e36ae7d668c0382125d2a205bbb18d16d9c3df01e81d4a26c5320f94dbf5e793
+size 333197
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/images/35639ff12c3127b2ba9419b7c784b212753ff628.png b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/images/35639ff12c3127b2ba9419b7c784b212753ff628.png
new file mode 100644
index 0000000000000000000000000000000000000000..b3f8f6a0a6dff29ae09a12161aa51f09f99cbef6
--- /dev/null
+++ b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/images/35639ff12c3127b2ba9419b7c784b212753ff628.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3b5266de03220e670712288c7f1273ac56cd458a23999edfe11151336c7d7b96
+size 269874
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/images/4bbdd852ecafe7b9f1c65dfdbba4a04a5de91642.png b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/images/4bbdd852ecafe7b9f1c65dfdbba4a04a5de91642.png
new file mode 100644
index 0000000000000000000000000000000000000000..72c3f9e8fe459f6a58b1dc0e9ca108d6a50f71ef
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/images/4bbdd852ecafe7b9f1c65dfdbba4a04a5de91642.png differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/images/5452ff4f227c6ba1d7ad666974203486e642daf6.png b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/images/5452ff4f227c6ba1d7ad666974203486e642daf6.png
new file mode 100644
index 0000000000000000000000000000000000000000..8c1d3ae54c72b64e086c466e53235cf0f8b22f5d
--- /dev/null
+++ b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/images/5452ff4f227c6ba1d7ad666974203486e642daf6.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:950b236ee7927e02c41e1c9f4d7101e283a5c3cc6e53a700b8e0a7f9326fa288
+size 657026
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/images/83d1124da2030bef8f40da55db202923268685e2.png b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/images/83d1124da2030bef8f40da55db202923268685e2.png
new file mode 100644
index 0000000000000000000000000000000000000000..69d457b3cb268f271026f2ec3d22c76d720c59ac
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/images/83d1124da2030bef8f40da55db202923268685e2.png differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/images/a1c98d25e5c2a3059235733edc58ea6984e75dc9.png b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/images/a1c98d25e5c2a3059235733edc58ea6984e75dc9.png
new file mode 100644
index 0000000000000000000000000000000000000000..7373b9902d892abf7ecdbb3ed19b9c21d23d5414
--- /dev/null
+++ b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/images/a1c98d25e5c2a3059235733edc58ea6984e75dc9.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e39c960001e890ba64a2fad3c7b2d2773228ab5ad894c2c381d15900b8f57041
+size 277111
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/images/fee3a1e81ae1678f114d5799e440cc2b7d740aa1.png b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/images/fee3a1e81ae1678f114d5799e440cc2b7d740aa1.png
new file mode 100644
index 0000000000000000000000000000000000000000..773a1db61b92e367a5ee707bc9077efba7e713ed
--- /dev/null
+++ b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/images/fee3a1e81ae1678f114d5799e440cc2b7d740aa1.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:837401a779f30e28375fe8ee0305bab849fd6e6aef33d70d11034e7a5a8cf7c4
+size 915138
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0001.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0001.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..4afa82fede49c5107d112f68a58b5b55a8cd39f7
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0001.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0002.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0002.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..5594e72fdb6f3e6b47237747790de9fc0853477d
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0002.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0003.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0003.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..f01013c0bb2cb9d517840f827d64be22a078619f
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0003.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0004.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0004.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..1a10ab19edf051efd52d339f94c97ecc4c29ee9a
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0004.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0005.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0005.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..2c9823521b8ec5ca93c3410f5955d293e4f8758a
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0005.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0006.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0006.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d30412cc7786f3a120ea604492b86102014fd47d
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0006.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0007.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0007.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..a58df1eaac9841f46cfe3eb349f2a0b0a81cf64b
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0007.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0008.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0008.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..62750ef341f5e7c2caccaa3a537eb0830de81a61
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0008.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0009.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0009.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3e7792617b3b8e34737d1ce32179c66575295628
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0009.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0010.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0010.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..af5d9d959e75b7b381335d3968425df993f72afd
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0010.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0011.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0011.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3a014b542668a8e07414e8e20dbdee602958af7e
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0011.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0012.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0012.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..c256d90012f3c7a06930099de89dbbeec677d102
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0012.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0013.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0013.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..24b11ee44f85eb8edff22b7b9b12fc1caf3f9edb
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0013.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0014.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0014.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..0d20a57198eed7c0204a45fbdecd2e884d000f3e
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0014.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0015.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0015.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..da2054cbca0978c7b4f580645592e50dc1a332fc
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0015.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0016.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0016.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..1b765f3c0e4d59ae686e25be8f204e36ee69f9b1
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0016.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0017.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0017.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..cd435d0673f1eaee304fa9a89da0be7dcd57b0c9
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0017.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0018.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0018.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..453616f7ce223a545578f0dde092269851bfebd9
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/slide_images/slide_0018.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/source.pptx b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/source.pptx
new file mode 100644
index 0000000000000000000000000000000000000000..a4663336820e0f6db49b6bd7f26e3bce03ee668c
--- /dev/null
+++ b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/source.pptx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3795ad16408515f6f3041fe35547e1cddc3a070e348b5660160f09d63563d343
+size 2877057
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template.pptx b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template.pptx
new file mode 100644
index 0000000000000000000000000000000000000000..29005f046a563309b51a38951a33a25dfc8735f2
--- /dev/null
+++ b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template.pptx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b41577e048ab4f6d384bc9629f53c45b1311f3a23e4e644d55af441edb478eb4
+size 368779
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0001.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0001.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..8993b2a0c9ab32813b06c9157bb4e0fd010b0ac1
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0001.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0002.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0002.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..7193af890545add528dd1a82c7ecd50f092c2ebc
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0002.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0003.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0003.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..1db26b05439dd60c93f69584856a34cfc86fb1a4
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0003.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0004.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0004.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..787b0ec2e0443ac6112120f42d5f51027826394c
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0004.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0005.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0005.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..c80354cdd62da17414de89e58b99d92fa7e683a7
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0005.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0006.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0006.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..43174fd628998fb699d6f3380a6e35d99fb8af2d
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0006.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0007.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0007.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..20b624eab11a2a6b79a0bada36e3839f4ec37c11
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0007.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0008.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0008.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..743da6e90377da82b9bcd28e2c31f629671c0741
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0008.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0009.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0009.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..4ebc7d3a91ae9d35b3dc0410fc75b06e59bd623f
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0009.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0010.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0010.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..f913ef0e517bbbc69b0218f340f6b4ceb97b7b4e
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0010.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0011.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0011.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..ae8b35c1683650390abacd4ad97a2ed905cd46d8
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0011.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0012.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0012.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..743da6e90377da82b9bcd28e2c31f629671c0741
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0012.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0013.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0013.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..e828b66e145420cb63be3b033141a4372aaa236c
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0013.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0014.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0014.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..6896c8410069799bc6bda64b99fdb24823b8232a
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0014.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0015.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0015.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..a17dff3fcd38b1d53252c210660f3eeadd11bcaa
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0015.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0016.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0016.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..bc4fd5eb06952701d677761b861959b12f3fc18f
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0016.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0017.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0017.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..ffc264bb4945818793d417be091b33a3f8af160a
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0017.jpg differ
diff --git a/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0018.jpg b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0018.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..2ff955d6f36bada4fa79a22b76c51d081d11c4b0
Binary files /dev/null and b/pptagent/runs/pptx/0210ff6b414902fa05857e734dd5bcee/template_images/slide_0018.jpg differ
diff --git a/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/image_stats.json b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/image_stats.json
new file mode 100644
index 0000000000000000000000000000000000000000..6fdc28a7f53337d1986aeae4aa5d91aa0f7a17d6
--- /dev/null
+++ b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/image_stats.json
@@ -0,0 +1,15 @@
+{
+ "1df5f510a94dff77293458473e5407d97a31bdfe.png": {
+ "size": [
+ 784,
+ 878
+ ],
+ "appear_times": 1,
+ "slide_numbers": [
+ 3
+ ],
+ "relative_area": 33.9647633744856,
+ "top_ranges_str": "3",
+ "caption": "Diagram: Illustration showing an inclined plane with an angle \\( \\theta \\), a reference point \\( C \\), and various vectors labeled \\( \\mathbf{e} \\), \\( \\mathbf{e_z} \\), and \\( \\mathbf{C_{ptmext}} \\) indicating direction and measurement."
+ }
+}
\ No newline at end of file
diff --git a/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/images/1df5f510a94dff77293458473e5407d97a31bdfe.png b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/images/1df5f510a94dff77293458473e5407d97a31bdfe.png
new file mode 100644
index 0000000000000000000000000000000000000000..d339e4716273eeb6a74d85e1daa5d54555e46cac
Binary files /dev/null and b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/images/1df5f510a94dff77293458473e5407d97a31bdfe.png differ
diff --git a/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/slide_images/slide_0001.jpg b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/slide_images/slide_0001.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..6171ae480ea54e615bd8af9ce5fdc32b2c14a4d5
Binary files /dev/null and b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/slide_images/slide_0001.jpg differ
diff --git a/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/slide_images/slide_0002.jpg b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/slide_images/slide_0002.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3dcd2e296b9d78c514a722ae77a2cdea3da821d6
Binary files /dev/null and b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/slide_images/slide_0002.jpg differ
diff --git a/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/slide_images/slide_0003.jpg b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/slide_images/slide_0003.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..42cb03677628089dcafadaf00193415b1ae55ed4
Binary files /dev/null and b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/slide_images/slide_0003.jpg differ
diff --git a/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/slide_images/slide_0004.jpg b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/slide_images/slide_0004.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..651baca97938b5867121a95571bb96a6b3e2a12e
Binary files /dev/null and b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/slide_images/slide_0004.jpg differ
diff --git a/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/slide_images/slide_0005.jpg b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/slide_images/slide_0005.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..2bc65109c1a60ec7b8082e5e4b3523867447748a
Binary files /dev/null and b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/slide_images/slide_0005.jpg differ
diff --git a/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/slide_images/slide_0006.jpg b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/slide_images/slide_0006.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..f3748a58373a7a8b6072484b276bee3b5c1666f0
Binary files /dev/null and b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/slide_images/slide_0006.jpg differ
diff --git a/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/slide_induction.json b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/slide_induction.json
new file mode 100644
index 0000000000000000000000000000000000000000..f109f08cd0b84c79d7d6bfe63ceec83284a0f032
--- /dev/null
+++ b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/slide_induction.json
@@ -0,0 +1,122 @@
+{
+ "opening": {
+ "slides": [
+ 1
+ ],
+ "template_id": 1,
+ "content_schema": {
+ "presenters": {
+ "description": "name of the presenter",
+ "type": "text",
+ "data": [
+ "潘伟洲(josephpan)"
+ ]
+ },
+ "affiliation": {
+ "description": "presenter's affiliation or department",
+ "type": "text",
+ "data": [
+ "SNG-社交平台部-空间运营中心"
+ ]
+ },
+ "presentation date": {
+ "description": "date of the presentation",
+ "type": "text",
+ "data": [
+ "2025/4/29"
+ ]
+ },
+ "main title": {
+ "description": "main title of the presentation",
+ "type": "text",
+ "data": [
+ "移动客户端通道面试陈述"
+ ]
+ }
+ }
+ },
+ "table of contents": {
+ "slides": [
+ 2
+ ],
+ "template_id": 2,
+ "content_schema": {
+ "main title": {
+ "description": "main title of the slide",
+ "type": "text",
+ "data": [
+ "Table of Contents"
+ ]
+ },
+ "content bullets": {
+ "description": "content bullets of the slide",
+ "type": "text",
+ "data": [
+ "个人经历",
+ "项目经验 ",
+ "技术影响力",
+ "专业领域优势"
+ ]
+ }
+ }
+ },
+ "section outline": {
+ "slides": [
+ 3,
+ 4,
+ 5
+ ],
+ "template_id": 3,
+ "content_schema": {
+ "main title": {
+ "description": "main title of the slide",
+ "type": "text",
+ "data": [
+ "个人经历"
+ ]
+ },
+ "content paragraph": {
+ "description": "content paragraph of the slide",
+ "type": "text",
+ "data": [
+ "这张图展示了一个倾斜圆盘在空间中的几何关系。圆盘的法向量为 \\vec{e},与竖直方向单位向量 \\vec{e}z 之间夹角为 \\theta,表示圆盘的倾斜角度。圆盘中心为点 C,红色箭头 \\vec{C}{ptmext} 表示作用在该点上的外力或外力矩。该图常用于描述刚体在三维空间中的姿态与受力关系。"
+ ]
+ },
+ "main image": {
+ "description": "main image of the slide",
+ "type": "image",
+ "data": [
+ "Diagram: Illustration showing an inclined plane with an angle \\( \\theta \\), a reference point \\( C \\), and various vectors labeled \\( \\mathbf{e} \\), \\( \\mathbf{e_z} \\), and \\( \\mathbf{C_{ptmext}} \\) indicating direction and measurement."
+ ]
+ }
+ }
+ },
+ "ending": {
+ "slides": [
+ 6
+ ],
+ "template_id": 6,
+ "content_schema": {
+ "main title": {
+ "description": "main title of the slide",
+ "type": "text",
+ "data": [
+ "本次报告到此结束"
+ ]
+ },
+ "content paragraph": {
+ "description": "additional content or closing remarks of the slide",
+ "type": "text",
+ "data": [
+ "欢迎批评指正!"
+ ]
+ }
+ }
+ },
+ "functional_keys": [
+ "opening",
+ "table of contents",
+ "section outline",
+ "ending"
+ ]
+}
\ No newline at end of file
diff --git a/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/source.pptx b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/source.pptx
new file mode 100644
index 0000000000000000000000000000000000000000..78d2eeb927d7f939becf52c190525b5f41520165
--- /dev/null
+++ b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/source.pptx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec684256c1f6edbc172b67432eb48c4fc2b68c7111630615a5719eb6e9117b7c
+size 129909
diff --git a/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/template.pptx b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/template.pptx
new file mode 100644
index 0000000000000000000000000000000000000000..c7f9759e4fd18b653649b3c56cec459bfaf0d711
Binary files /dev/null and b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/template.pptx differ
diff --git a/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/template_images/slide_0001.jpg b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/template_images/slide_0001.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..9ba7d58564fc4bfe93b778572cec6720e0bb9955
Binary files /dev/null and b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/template_images/slide_0001.jpg differ
diff --git a/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/template_images/slide_0002.jpg b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/template_images/slide_0002.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..7b85d91751058b26613bb4f6400f0bbcf5b2dc61
Binary files /dev/null and b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/template_images/slide_0002.jpg differ
diff --git a/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/template_images/slide_0003.jpg b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/template_images/slide_0003.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b366275bb58556d2bbf997920b0b423f2faba64c
Binary files /dev/null and b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/template_images/slide_0003.jpg differ
diff --git a/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/template_images/slide_0004.jpg b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/template_images/slide_0004.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..f44d8046f8448dd38df987799864c42c6ad4c3b0
Binary files /dev/null and b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/template_images/slide_0004.jpg differ
diff --git a/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/template_images/slide_0005.jpg b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/template_images/slide_0005.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..94c2472fa24bf1f0d7524d43cfda09ee3798e370
Binary files /dev/null and b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/template_images/slide_0005.jpg differ
diff --git a/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/template_images/slide_0006.jpg b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/template_images/slide_0006.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..6e40e8d132a9d5bf4fa8eb3e7d1773f1de409da3
Binary files /dev/null and b/pptagent/runs/pptx/c1eb4d337b2aa71bec0b0bda89322db2/template_images/slide_0006.jpg differ
diff --git a/pptagent/utils.py b/pptagent/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a6f00408155d8c4ae435f0a05e1cda36f2f4195
--- /dev/null
+++ b/pptagent/utils.py
@@ -0,0 +1,664 @@
+import asyncio
+import json
+import logging
+import os
+import shutil
+import subprocess
+import tempfile
+import traceback
+from io import StringIO
+from itertools import product
+from shutil import which
+from time import sleep, time
+from typing import Any, Optional
+
+import json_repair
+import Levenshtein
+import matplotlib.pyplot as plt
+import pandas as pd
+from html2image import Html2Image
+from mistune import html as markdown
+from pdf2image import convert_from_path
+from PIL import Image as PILImage
+from pptx.dml.color import RGBColor
+from pptx.oxml import parse_xml
+from pptx.parts.image import Image
+from pptx.shapes.group import GroupShape
+from pptx.text.text import _Paragraph, _Run
+from pptx.util import Length, Pt
+from tenacity import RetryCallState, retry, stop_after_attempt, wait_fixed
+
+
+def get_logger(name="pptagent", level=None):
+ """
+ Get a logger with the specified name and level.
+
+ Args:
+ name (str): The name of the logger.
+ level (int): The logging level (default: logging.INFO).
+
+ Returns:
+ logging.Logger: A configured logger instance.
+ """
+ if level is None:
+ level = int(os.environ.get("LOG_LEVEL", logging.INFO))
+
+ logger = logging.getLogger(name)
+ logger.setLevel(level)
+
+ # Check if the logger already has handlers to avoid duplicates
+ if not logger.handlers:
+ # Create console handler and set level
+ console_handler = logging.StreamHandler()
+ console_handler.setLevel(level)
+
+ # Create formatter
+ formatter = logging.Formatter(
+ "%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s"
+ )
+ console_handler.setFormatter(formatter)
+
+ # Add handler to logger
+ logger.addHandler(console_handler)
+
+ return logger
+
+
+logger = get_logger(__name__)
+
+if which("soffice") is None:
+    logger.warning("soffice is not installed; pptx-to-image conversion will not work")
+
+# Set of supported image extensions
+IMAGE_EXTENSIONS: set[str] = {
+ "bmp",
+ "jpg",
+ "jpeg",
+ "pgm",
+ "png",
+ "ppm",
+ "tif",
+ "tiff",
+ "webp",
+}
+
+# Common colors and measurements
+BLACK = RGBColor(0, 0, 0)
+YELLOW = RGBColor(255, 255, 0)
+BLUE = RGBColor(0, 0, 255)
+BORDER_LEN = Pt(2)
+BORDER_OFFSET = Pt(2)
+LABEL_LEN = Pt(24)
+FONT_LEN = Pt(20)
+
+
+def is_image_path(file: str) -> bool:
+ """
+ Check if a file path is an image based on its extension.
+
+ Args:
+ file (str): The file path to check.
+
+ Returns:
+ bool: True if the file is an image, False otherwise.
+ """
+ return file.split(".")[-1].lower() in IMAGE_EXTENSIONS
+
+
+def runs_merge(paragraph: _Paragraph) -> Optional[_Run]:
+ """
+ Merge all runs in a paragraph into a single run.
+
+ Args:
+ paragraph (_Paragraph): The paragraph to merge runs in.
+
+ Returns:
+ Optional[_Run]: The merged run, or None if there are no runs.
+ """
+ runs = paragraph.runs
+
+ # Handle field codes
+ if len(runs) == 0:
+ runs = [
+ _Run(r, paragraph)
+ for r in parse_xml(paragraph._element.xml.replace("fld", "r")).r_lst
+ ]
+ if len(runs) == 1:
+ return runs[0]
+ if len(runs) == 0:
+ return None
+
+ # Find the run with the most text
+ run = max(runs, key=lambda x: len(x.text))
+ run.text = paragraph.text
+
+ # Remove other runs
+ for r in runs:
+ if r != run:
+ r._r.getparent().remove(r._r)
+ return run
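+
+# Example (a sketch; "deck.pptx" is an illustrative file name):
+#     from pptx import Presentation
+#     para = Presentation("deck.pptx").slides[0].shapes[0].text_frame.paragraphs[0]
+#     run = runs_merge(para)  # a single run now carries the paragraph's full text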
+
+
+def older_than(filepath: str, seconds: int = 10, wait: bool = False) -> bool:
+ """
+ Check if a file is older than a specified number of seconds.
+
+ Args:
+ filepath (str): The path to the file.
+ seconds (int): The number of seconds to check against.
+ wait (bool): Whether to wait for the file to exist.
+
+ Returns:
+ bool: True if the file is older than the specified number of seconds, False otherwise.
+ """
+ if not os.path.exists(filepath):
+ while wait:
+ logger.info("waiting for: %s", filepath)
+ sleep(1)
+ if os.path.exists(filepath):
+ sleep(seconds)
+ return True
+ return False
+ file_creation_time = os.path.getctime(filepath)
+ current_time = time()
+ return seconds < (current_time - file_creation_time)
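+
+# Note: older_than(path, seconds=10) is True only if `path` exists and was created
+# more than 10 seconds ago; with wait=True it blocks until the file appears, then
+# sleeps a further `seconds` before returning True.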
+
+
+def edit_distance(text1: str, text2: str) -> float:
+ """
+    Calculate a normalized similarity score between two strings based on
+    Levenshtein edit distance.
+
+    Args:
+        text1 (str): The first string.
+        text2 (str): The second string.
+
+    Returns:
+        float: The similarity score (0.0 to 1.0, where 1.0 means identical).
+ """
+ if not text1 and not text2:
+ return 1.0
+ return 1 - Levenshtein.distance(text1, text2) / max(len(text1), len(text2))
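+
+# Example: edit_distance("kitten", "sitting") returns 1 - 3/7 ≈ 0.571, since the
+# Levenshtein distance between the two strings is 3.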
+
+
+def tenacity_log(retry_state: RetryCallState) -> None:
+ """
+ Log function for tenacity retries.
+
+ Args:
+ retry_state (RetryCallState): The retry state.
+ """
+ logger.warning("tenacity retry: %s", retry_state)
+ traceback.print_tb(retry_state.outcome.exception().__traceback__)
+
+
+def get_json_from_response(response: str) -> dict[str, Any]:
+ """
+ Extract JSON from a text response.
+
+ Args:
+ response (str): The response text.
+
+ Returns:
+ Dict[str, Any]: The extracted JSON.
+
+ Raises:
+ Exception: If JSON cannot be extracted from the response.
+ """
+ response = response.strip()
+
+ try:
+ return json.loads(response)
+ except Exception:
+ pass
+
+ # Try to extract JSON from markdown code blocks
+ l, r = response.rfind("```json"), response.rfind("```")
+ if l != -1 and r != -1:
+ json_obj = json_repair.loads(response[l + 7 : r].strip())
+ if isinstance(json_obj, (dict, list)):
+ return json_obj
+
+ # Try to find JSON by looking for matching braces
+ open_braces = []
+ close_braces = []
+
+ for i, char in enumerate(response):
+ if char == "{" or char == "[":
+ open_braces.append(i)
+ elif char == "}" or char == "]":
+ close_braces.append(i)
+
+ for i, j in product(open_braces, reversed(close_braces)):
+ if i > j:
+ continue
+ try:
+ json_obj = json_repair.loads(response[i : j + 1])
+ if isinstance(json_obj, (dict, list)):
+ return json_obj
+ except Exception:
+ pass
+
+ raise Exception("JSON not found in the given output", response)
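+
+# Example (a sketch): the parser tolerates chatty LLM output, so both
+#     get_json_from_response('{"title": "Intro"}')
+#     get_json_from_response('Sure! ```json\n{"title": "Intro"}\n```')
+# return {"title": "Intro"}.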
+
+
+# Create a tenacity decorator with custom settings
+def tenacity_decorator(_func=None, *, wait: int = 3, stop: int = 5):
+ def decorator(func):
+ return retry(wait=wait_fixed(wait), stop=stop_after_attempt(stop))(func)
+
+ if _func is None:
+ # Called with arguments
+ return decorator
+ else:
+ # Called without arguments
+ return decorator(_func)
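+
+# The decorator can be applied bare or with arguments, e.g.:
+#     @tenacity_decorator
+#     def flaky(): ...
+#
+#     @tenacity_decorator(wait=1, stop=3)  # at most 3 attempts, 1 second apart
+#     def also_flaky(): ...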
+
+
+TABLE_CSS = """
+table {
+ border-collapse: collapse; /* Merge borders */
+ width: auto; /* Width adapts to content */
+ font-family: SimHei, Arial, sans-serif; /* Font supporting Chinese characters */
+ background: white;
+}
+th, td {
+ border: 1px solid black; /* Add borders */
+ padding: 8px; /* Cell padding */
+ text-align: center; /* Center text */
+}
+th {
+ background-color: #f2f2f2; /* Header background color */
+}
+"""
+
+
+# Convert Markdown to HTML
+# def markdown_table_to_image(markdown_text: str, output_path: str):
+# """
+# Convert a Markdown table to a cropped image
+
+# Args:
+# markdown_text (str): Markdown text containing a table
+# output_path (str): Output image path, defaults to 'table_cropped.png'
+
+# Returns:
+# str: The path of the generated image
+# """
+# html = markdown(markdown_text)
+# assert "table" in html, "Failed to find table in markdown"
+
+# parent_dir, basename = os.path.split(output_path)
+# hti = Html2Image(
+# disable_logging=True,
+# output_path=parent_dir,
+# custom_flags=["--no-sandbox", "--headless"],
+# )
+# hti.browser.use_new_headless = None
+# hti.screenshot(html_str=html, css_str=TABLE_CSS, save_as=basename)
+
+# img = PILImage.open(output_path).convert("RGB")
+# bbox = img.getbbox()
+# assert (
+# bbox is not None
+# ), "Failed to capture the bbox, may be markdown table conversion failed"
+# bbox = (0, 0, bbox[2] + 10, bbox[3] + 10)
+# img.crop(bbox).save(output_path)
+# return output_path
+
+
+def markdown_table_to_image(markdown_text: str, output_path: str) -> str:
+ """
+ Convert a Markdown table to an image using pandas and matplotlib.
+
+ Args:
+ markdown_text (str): Markdown text containing a table.
+ output_path (str): Path to save the output image.
+
+ Returns:
+ str: The path of the generated image.
+ """
+    # Read the markdown table into a DataFrame, splitting cells on pipes
+    df = pd.read_csv(
+        StringIO(markdown_text),
+        sep=r'\|',
+        engine='python',
+        skipinitialspace=True
+    )
+    # Remove empty columns produced by leading/trailing pipes
+    mask = [col.strip() != '' and not col.startswith('Unnamed') for col in df.columns]
+    df = df.loc[:, mask]
+    # Strip cell whitespace and drop the markdown alignment row (e.g. |---|---|)
+    df = df.astype(str).apply(lambda col: col.str.strip())
+    df = df[~df.apply(lambda row: all(set(cell) <= set('-: ') for cell in row), axis=1)]
+
+ # Create figure and axis
+ fig, ax = plt.subplots()
+ ax.axis('off')
+
+ # Create table
+ table = ax.table(
+ cellText=df.values,
+ colLabels=df.columns.str.strip(),
+ cellLoc='center',
+ loc='center'
+ )
+ table.auto_set_font_size(False)
+ table.set_fontsize(12)
+ table.scale(1, 1.5)
+
+ # Save figure
+ fig.savefig(output_path, bbox_inches='tight', dpi=150)
+ plt.close(fig)
+ return output_path
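+
+# Example (a sketch; the output path is illustrative):
+#     markdown_table_to_image(
+#         "| Name | Score |\n|------|-------|\n| A | 1 |\n| B | 2 |",
+#         "/tmp/table.png",
+#     )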
+
+
+@tenacity_decorator
+def ppt_to_images(file: str, output_dir: str):
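+    """
+    Render a PPT/PPTX file to per-slide JPEG images.
+
+    The presentation is first converted to PDF with LibreOffice (soffice),
+    then each PDF page is rasterized at 72 DPI and saved to output_dir as
+    slide_0001.jpg, slide_0002.jpg, ...
+
+    Args:
+        file (str): Path to the presentation file.
+        output_dir (str): Directory where slide images are written.
+    """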
+ assert pexists(file), f"File {file} does not exist"
+ if pexists(output_dir):
+ logger.warning(f"ppt2images: {output_dir} already exists")
+ os.makedirs(output_dir, exist_ok=True)
+ with tempfile.TemporaryDirectory() as temp_dir:
+ command_list = [
+ "soffice",
+ "--headless",
+ "--convert-to",
+ "pdf",
+ file,
+ "--outdir",
+ temp_dir,
+ ]
+ process = subprocess.Popen(
+ command_list, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+ )
+ out, err = process.communicate()
+ if process.returncode != 0:
+ raise RuntimeError(f"soffice failed with error: {err.decode()}")
+
+ for f in os.listdir(temp_dir):
+ if not f.endswith(".pdf"):
+ continue
+ temp_pdf = pjoin(temp_dir, f)
+ images = convert_from_path(temp_pdf, dpi=72)
+ for i, img in enumerate(images):
+ img.save(pjoin(output_dir, f"slide_{i+1:04d}.jpg"))
+ return
+
+ raise RuntimeError(
+ f"No PDF file was created in the temporary directory: {file}\n"
+ f"Output: {out.decode()}\n"
+ f"Error: {err.decode()}"
+ )
+
+
+@tenacity_decorator
+async def ppt_to_images_async(file: str, output_dir: str):
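+    """
+    Async variant of ppt_to_images: renders a PPT/PPTX file to per-slide JPEG
+    images without blocking the event loop (soffice runs as an asyncio
+    subprocess).
+
+    Args:
+        file (str): Path to the presentation file.
+        output_dir (str): Directory where slide images are written.
+    """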
+ assert pexists(file), f"File {file} does not exist"
+ if pexists(output_dir):
+ logger.debug(f"ppt2images: {output_dir} already exists")
+ os.makedirs(output_dir, exist_ok=True)
+
+ with tempfile.TemporaryDirectory() as temp_dir:
+ command_list = [
+ "soffice",
+ "--headless",
+ "--convert-to",
+ "pdf",
+ file,
+ "--outdir",
+ temp_dir,
+ ]
+
+ process = await asyncio.create_subprocess_exec(
+ *command_list,
+ stdout=asyncio.subprocess.PIPE,
+ stderr=asyncio.subprocess.PIPE,
+ )
+ stdout, stderr = await process.communicate()
+ if process.returncode != 0:
+ raise RuntimeError(f"soffice failed with error: {stderr.decode()}")
+ for f in os.listdir(temp_dir):
+ if not f.endswith(".pdf"):
+ continue
+ temp_pdf = pjoin(temp_dir, f)
+ images = convert_from_path(temp_pdf, dpi=72)
+ for i, img in enumerate(images):
+ img.save(pjoin(output_dir, f"slide_{i+1:04d}.jpg"))
+ return
+
+ raise RuntimeError(
+ f"No PDF file was created in the temporary directory: {file}\n"
+ f"Output: {stdout.decode()}\n"
+ f"Error: {stderr.decode()}"
+ )
+
+
+def parsing_image(image: Image, image_path: str) -> str:
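+    """
+    Persist a pptx-embedded image to image_path and return the resulting path.
+
+    WMF blobs are converted to JPG via LibreOffice; other supported formats are
+    written out unchanged. Raises ValueError for unsupported image types.
+    """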
+    # Handle WMF images (Windows Metafile) by converting them to JPG
+ if image.ext == "wmf":
+ image_path = image_path.replace(".wmf", ".jpg")
+ if not pexists(image_path):
+ wmf_to_images(image.blob, image_path)
+ # Check for supported image types
+ elif image.ext not in IMAGE_EXTENSIONS:
+ raise ValueError(f"Unsupported image type {image.ext}")
+
+ # Save image if it doesn't exist
+ if not pexists(image_path):
+ with open(image_path, "wb") as f:
+ f.write(image.blob)
+ return image_path
+
+
+@tenacity_decorator
+def wmf_to_images(blob: bytes, filepath: str):
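+    """
+    Convert a WMF image blob to a JPG file at filepath using LibreOffice (soffice).
+    """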
+ if not filepath.endswith(".jpg"):
+ raise ValueError("filepath must end with .jpg")
+ dirname = os.path.dirname(filepath)
+ basename = os.path.basename(filepath).removesuffix(".jpg")
+ with tempfile.TemporaryDirectory() as temp_dir:
+ with open(pjoin(temp_dir, f"{basename}.wmf"), "wb") as f:
+ f.write(blob)
+ command_list = [
+ "soffice",
+ "--headless",
+ "--convert-to",
+ "jpg",
+ pjoin(temp_dir, f"{basename}.wmf"),
+ "--outdir",
+ dirname,
+ ]
+ subprocess.run(command_list, check=True, stdout=subprocess.DEVNULL)
+
+ assert pexists(filepath), f"File {filepath} does not exist"
+
+
+def parse_groupshape(groupshape: GroupShape) -> list[dict[str, Length]]:
+ """
+ Parse a group shape to get the bounds of its child shapes.
+
+ Args:
+ groupshape (GroupShape): The group shape to parse.
+
+ Returns:
+ List[Dict[str, Length]]: The bounds of the child shapes.
+
+ Raises:
+ AssertionError: If the input is not a GroupShape.
+ """
+ assert isinstance(groupshape, GroupShape), "Input must be a GroupShape"
+
+ # Get group bounds
+ group_top_left_x = groupshape.left
+ group_top_left_y = groupshape.top
+ group_width = groupshape.width
+ group_height = groupshape.height
+
+ # Get shape bounds
+ shape_top_left_x = min([sp.left for sp in groupshape.shapes])
+ shape_top_left_y = min([sp.top for sp in groupshape.shapes])
+ shape_width = (
+ max([sp.left + sp.width for sp in groupshape.shapes]) - shape_top_left_x
+ )
+ shape_height = (
+ max([sp.top + sp.height for sp in groupshape.shapes]) - shape_top_left_y
+ )
+
+ # Calculate bounds for each shape in the group
+ group_shape_xy = []
+ for sp in groupshape.shapes:
+ group_shape_left = (
+ sp.left - shape_top_left_x
+ ) * group_width / shape_width + group_top_left_x
+ group_shape_top = (
+ sp.top - shape_top_left_y
+ ) * group_height / shape_height + group_top_left_y
+ group_shape_width = sp.width * group_width / shape_width
+ group_shape_height = sp.height * group_height / shape_height
+
+ group_shape_xy.append(
+ {
+ "left": Length(group_shape_left),
+ "top": Length(group_shape_top),
+ "width": Length(group_shape_width),
+ "height": Length(group_shape_height),
+ }
+ )
+
+ return group_shape_xy
+
+
+def is_primitive(obj: Any) -> bool:
+ """
+ Check if an object is a primitive type or a collection of primitive types.
+
+ Args:
+ obj (Any): The object to check.
+
+ Returns:
+ bool: True if the object is a primitive type or a collection of primitive types, False otherwise.
+ """
+ if isinstance(obj, (list, tuple, set, frozenset)):
+ return all(is_primitive(item) for item in obj)
+
+ return isinstance(
+ obj, (int, float, complex, bool, str, bytes, bytearray, type(None))
+ )
+
+
+DEFAULT_EXCLUDE: set[str] = {"element", "language_id", "ln", "placeholder_format"}
+
+
+def dict_to_object(
+ dict_obj: dict[str, Any], obj: Any, exclude: Optional[set[str]] = None
+) -> None:
+ """
+ Apply dictionary values to an object.
+
+ Args:
+ dict_obj (Dict[str, Any]): The dictionary with values to apply.
+ obj (Any): The object to apply values to.
+ exclude (Optional[Set[str]]): The keys to exclude.
+ """
+ if exclude is None:
+ exclude = set()
+
+ for key, value in dict_obj.items():
+ if key not in exclude and value is not None:
+ setattr(obj, key, value)
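+
+# Example (a sketch; `run` stands for a hypothetical python-pptx run object):
+#     dict_to_object({"bold": True, "size": Pt(18)}, run.font)
+# sets run.font.bold and run.font.size, skipping None values and excluded keys.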
+
+
+def package_join(*paths: str) -> str:
+ """
+ Join paths with the appropriate separator for the platform.
+
+ Args:
+ *paths: The paths to join.
+
+ Returns:
+ str: The joined path.
+ """
+ _dir = pdirname(__file__)
+ return pjoin(_dir, *paths)
+
+
+class Config:
+ """
+ Configuration class for the application.
+ """
+
+ def __init__(
+ self,
+ rundir: Optional[str] = None,
+ session_id: Optional[str] = None,
+ ):
+ """
+ Initialize the configuration.
+
+ Args:
+ rundir (Optional[str]): The run directory.
+ session_id (Optional[str]): The session ID.
+ """
+ if rundir is not None:
+ self.set_rundir(rundir)
+ elif session_id is not None:
+ self.set_session(session_id)
+ else:
+ raise ValueError("No session ID or run directory provided")
+
+ def set_session(self, session_id: str) -> None:
+ """
+ Set the session ID and update the run directory.
+
+ Args:
+ session_id (str): The session ID.
+ """
+ self.session_id = session_id
+ self.set_rundir(f"./runs/{session_id}")
+
+ def set_rundir(self, rundir: str) -> None:
+ """
+ Set the run directory and create necessary subdirectories.
+
+ Args:
+ rundir (str): The run directory.
+ """
+ self.RUN_DIR = rundir
+ self.IMAGE_DIR = pjoin(self.RUN_DIR, "images")
+
+ for the_dir in [self.RUN_DIR, self.IMAGE_DIR]:
+ os.makedirs(the_dir, exist_ok=True)
+
+ def set_debug(self, debug: bool) -> None:
+ """
+ Set the debug mode.
+
+ Args:
+ debug (bool): Whether to enable debug mode.
+ """
+ self.DEBUG = debug
+
+ def remove_rundir(self) -> None:
+ """
+ Remove the run directory and its subdirectories.
+ """
+ if pexists(self.RUN_DIR):
+ shutil.rmtree(self.RUN_DIR)
+ if pexists(self.IMAGE_DIR):
+ shutil.rmtree(self.IMAGE_DIR)
+
+ def __repr__(self) -> str:
+ """
+ Get a string representation of the configuration.
+
+ Returns:
+ str: A string representation of the configuration.
+ """
+ attrs = []
+ for attr in dir(self):
+ if not attr.startswith("_") and not callable(getattr(self, attr)):
+ attrs.append(f"{attr}={getattr(self, attr)}")
+ return f"Config({', '.join(attrs)})"
+
+
+# Path utility functions
+pjoin = os.path.join
+pexists = os.path.exists
+pbasename = os.path.basename
+pdirname = os.path.dirname
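+
+# Typical usage (a sketch; the session id is illustrative):
+#     config = Config(session_id="demo")
+#     config.IMAGE_DIR        # ./runs/demo/images, created on initialization
+#     config.remove_rundir()  # delete the whole run directory afterwards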
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7a68dd7b00bf04de02107921ad790a5fca9c75d4
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,42 @@
+beautifulsoup4
+fastapi
+einops
+func_argparse
+html2image
+jinja2
+json_repair
+jsonlines
+lxml
+mistune
+marker-pdf==1.1.0
+oaib
+openai
+opencv-python-headless
+pandas
+pdf2image
+peft
+pillow
+PyPDF2
+python-Levenshtein
+python-multipart
+python-pptx @ git+https://github.com/Force1ess/python-pptx@219513d7d81a61961fc541578c1857d08b43aa2a
+rich
+socksio
+tenacity
+tiktoken
+timm
+uvicorn
+numpy<2
+setproctitle==1.3.3
+attrdict==2.0.1
+librosa==0.10.2.post1
+langdetect==1.0.9
+pydub==0.25.1
+pyloudnorm==0.1.1
+modelscope==1.22.2
+transformers==4.49.0
+x-transformers==1.44.4
+torchdiffeq==0.2.5
+openai-whisper==20240930
+httpx==0.28.1
+gradio==5.23.1
\ No newline at end of file
diff --git a/templates/Template1.pptx b/templates/Template1.pptx
new file mode 100644
index 0000000000000000000000000000000000000000..a4663336820e0f6db49b6bd7f26e3bce03ee668c
--- /dev/null
+++ b/templates/Template1.pptx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3795ad16408515f6f3041fe35547e1cddc3a070e348b5660160f09d63563d343
+size 2877057
diff --git a/templates/Template2.pptx b/templates/Template2.pptx
new file mode 100644
index 0000000000000000000000000000000000000000..62938aa70593fc1dc347ceb3575d44cab8338bf9
--- /dev/null
+++ b/templates/Template2.pptx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:05b93d432ee3a347e571786e74cac0373d1d45803ad87dcdf079572c25d83f0a
+size 125824
diff --git a/templates/Template3.pptx b/templates/Template3.pptx
new file mode 100644
index 0000000000000000000000000000000000000000..bd348177e489acd8862b243e61de8c86fda8c938
--- /dev/null
+++ b/templates/Template3.pptx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:96084fe919cdbdc91a66d340b0f9594b8f31b318dab24188d846dd2e1514ce84
+size 2472949
diff --git a/templates/previews/Template1.jpg b/templates/previews/Template1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..e5d2572025dcccdedf2fddedf2f9db9e3fe660b2
--- /dev/null
+++ b/templates/previews/Template1.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c081af9017bfdc9f4fed547de9e5040ef60b9279ee0464f7ab34b9a7facb726
+size 376407
diff --git a/templates/previews/Template2.jpg b/templates/previews/Template2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..db9abfd707567e3e9502beb27e087a03fbc9cbee
Binary files /dev/null and b/templates/previews/Template2.jpg differ
diff --git a/templates/previews/Template3.jpg b/templates/previews/Template3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..dbb603031b4cd4221435831aae75071392796338
Binary files /dev/null and b/templates/previews/Template3.jpg differ