Skip to content

Commit

Permalink
fix: megaparse.load & add tests (#202)
Browse files Browse the repository at this point in the history
* fix: megaparse.load & add tests

* fix needed version in ReadMe & remove python3.10 test in CI
  • Loading branch information
chloedia authored Dec 19, 2024
1 parent fbb7d36 commit 13c2677
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 5 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.10", "3.11", "3.12"]
python-version: ["3.11", "3.12"]
steps:
- name: 👀 Checkout code
uses: actions/checkout@v2
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ https://github.com/QuivrHQ/MegaParse/assets/19614572/1b4cdb73-8dc2-44ef-b8b4-a75

## Installation

Requires Python >= 3.11.

```bash
pip install megaparse
```
Expand Down
20 changes: 17 additions & 3 deletions libs/megaparse/src/megaparse/megaparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,10 +105,24 @@ def load(
file: BinaryIO | None = None,
file_extension: str | FileExtension = "",
) -> str:
loop = asyncio.get_event_loop()
return loop.run_until_complete(
self.aload(file_path=file_path, file=file, file_extension=file_extension)
file_extension = self.validate_input(
file=file, file_path=file_path, file_extension=file_extension
)
try:
parser = self._select_parser(file_path, file, file_extension)
logger.info(f"Parsing using {parser.__class__.__name__} parser.")
parsed_document = parser.convert(
file_path=file_path, file=file, file_extension=file_extension
)
# @chloe FIXME: format_checker expects unstructured Elements as input; this needs to change
# if self.format_checker:
# parsed_document: str = await self.format_checker.check(parsed_document)
self.last_parsed_document = parsed_document
return parsed_document
except Exception as e:
raise ParsingException(
f"Error while parsing file {file_path or file}, file_extension: {file_extension}: {e}"
)

def _select_parser(
self,
Expand Down
10 changes: 9 additions & 1 deletion libs/megaparse/tests/pdf/test_pdf_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,21 @@ def test_get_default_processors_megaparse():

@pytest.mark.asyncio
@pytest.mark.parametrize("pdf_name", ["scanned_pdf", "native_pdf"])
async def test_megaparse_pdf_processor_file_path(pdf_name, request):
async def test_async_megaparse_pdf_processor_file_path(pdf_name, request):
pdf = request.getfixturevalue(pdf_name)
processor = MegaParse()
result = await processor.aload(file_path=pdf)
assert len(result) > 0


@pytest.mark.parametrize("pdf_name", ["scanned_pdf", "native_pdf"])
def test_sync_megaparse_pdf_processor_file_path(pdf_name, request):
    """Check that the synchronous ``MegaParse.load`` yields non-empty output.

    Runs once per parametrized fixture name ("scanned_pdf", "native_pdf").
    The fixture is looked up by name through ``request.getfixturevalue`` and
    its value is handed to ``load`` as ``file_path``.
    """
    pdf_source = request.getfixturevalue(pdf_name)
    megaparse = MegaParse()
    parsed_output = megaparse.load(file_path=pdf_source)
    # Only non-emptiness is checked; the parsed content itself is not inspected.
    assert len(parsed_output) > 0


@pytest.mark.asyncio
@pytest.mark.parametrize("pdf_name", ["scanned_pdf", "native_pdf"])
async def test_megaparse_pdf_processor_file(pdf_name, request):
Expand Down

0 comments on commit 13c2677

Please sign in to comment.