From 74a53e1b9183d69bff0c8f1769c5dfb151eb8338 Mon Sep 17 00:00:00 2001
From: Timo Tijhof
Date: Wed, 11 Oct 2023 14:50:54 -0700
Subject: [PATCH] Build: Enable typesense scraper

Ref https://github.com/jquery/infrastructure-puppet/issues/33
---
 .github/workflows/typesense.yaml | 30 ++++++++++++++++
 docsearch.config.json            | 59 ++++++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+)
 create mode 100644 .github/workflows/typesense.yaml
 create mode 100644 docsearch.config.json

diff --git a/.github/workflows/typesense.yaml b/.github/workflows/typesense.yaml
new file mode 100644
index 0000000..f96629a
--- /dev/null
+++ b/.github/workflows/typesense.yaml
@@ -0,0 +1,30 @@
+name: typesense
+on:
+  # Once a day at 11:30 UTC
+  schedule:
+    - cron: '30 11 * * *'
+  # Or after a deployment
+  push:
+    branches:
+      - main
+      - add-typesense
+  # Or manually
+  workflow_dispatch:
+
+jobs:
+  typesense:
+    name: Update Typesense
+    if: ${{ github.repository_owner == 'jquery' }} # skip on forks
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Docsearch Scraper
+        shell: bash
+        run: |
+          docker run \
+          -e TYPESENSE_API_KEY="${{ secrets.TYPESENSE_ADMIN_KEY }}" \
+          -e TYPESENSE_HOST="${{ secrets.TYPESENSE_HOST }}" \
+          -e TYPESENSE_PORT="443" \
+          -e TYPESENSE_PROTOCOL="https" \
+          -e CONFIG="$(jq -r tostring docsearch.config.json)" \
+          typesense/docsearch-scraper:0.8.0
diff --git a/docsearch.config.json b/docsearch.config.json
new file mode 100644
index 0000000..16c209b
--- /dev/null
+++ b/docsearch.config.json
@@ -0,0 +1,59 @@
+{
+  "index_name": "jquerymobile_com",
+  "start_urls": [
+    { "url": "https://api.jquerymobile.com", "selectors_key": "api", "page_rank": 20 },
+    { "url": "https://jquerymobile.com", "page_rank": 10 }
+  ],
+  "// stop_urls": [
+    "// Exclude URLs containing '?' such as /themeroller/?..."
+  ],
+  "stop_urls": [
+    "\\?",
+    ".com/category/",
+    ".com/resources/",
+    ".com\\/\\d\\."
+  ],
+  "selectors": {
+    "default": {
+      "lvl0": {
+        "selector": "#menu-top .menu-item.current > a",
+        "global": true,
+        "default_value": "Documentation"
+      },
+      "lvl1": "#content h1",
+      "lvl2": "#content h2",
+      "lvl3": "#content h3",
+      "lvl4": "#content h4",
+      "lvl5": "#content h5",
+      "text": "#content p, #content li, #content tr"
+    },
+    "api": {
+      "lvl0": {
+        "selector": "#categories .cat-item.current-cat > a",
+        "global": true,
+        "default_value": "API"
+      },
+      "lvl1": "#content h1",
+      "lvl2": "#content h2, #content h4.name",
+      "lvl3": "#content h3, #content h4:not(.name)",
+      "lvl4": "#content h5, #content strong:first-child",
+      "text": ".entry-content p, .entry-content li"
+    }
+  },
+  "custom_settings": {
+    "token_separators": ["_", "-", "."]
+  },
+  "selectors_exclude": [
+    "header ~ article",
+    ".returns",
+    ".version-details",
+    ".section-title",
+    ".icon-link.toc-link",
+    "[class^=toclevel]",
+    "#toctitle",
+    ".desc strong:first-child",
+    "#quick-nav header h2"
+  ],
+  "min_indexed_level": 2,
+  "scrape_start_urls": false
+}