-
Notifications
You must be signed in to change notification settings - Fork 65
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This includes the sitemap so that we're sure no content is missed. Unlike api.jquery.com, api.jquerymobile.com does not start with an index that links to all content pages. This means the crawler would have to rely on category pages to discover all content, except we don't want the crawler to index /category/ pages, so they are matched by stop_urls, which means they are never crawled. If there was a variant of `stop_urls` that behaved like `follow,noindex` instead of `noindex,follow` we could use that, but I'm not aware of such a feature. The sitemap accomplishes the same thing in a more efficient manner. Ref jquery/infrastructure-puppet#33
- Loading branch information
Showing
2 changed files
with
90 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
name: typesense | ||
on: | ||
# Re-index after a deployment (a push to one of these branches) | ||
push: | ||
branches: | ||
- main | ||
- add-typesense | ||
# Or trigger the re-index manually | ||
workflow_dispatch: | ||
|
||
jobs: | ||
typesense: | ||
name: Update Typesense | ||
if: ${{ github.repository_owner == 'jquery' }} # skip on forks | ||
runs-on: ubuntu-latest | ||
steps: | ||
- uses: actions/checkout@v3 | ||
- name: Docsearch Scraper | ||
shell: bash | ||
# Secrets are substituted into the script text before bash parses it, | ||
# so every interpolated value is double-quoted. | ||
run: | | ||
docker run \ | ||
-e TYPESENSE_API_KEY="${{ secrets.TYPESENSE_ADMIN_KEY }}" \ | ||
-e TYPESENSE_HOST="${{ secrets.TYPESENSE_HOST }}" \ | ||
-e TYPESENSE_PORT="443" \ | ||
-e TYPESENSE_PROTOCOL="https" \ | ||
-e CONFIG="$(jq -r tostring docsearch.config.json)" \ | ||
typesense/docsearch-scraper:0.8.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
{ | ||
"index_name": "jquerymobile_com", | ||
"start_urls": [ | ||
{ "url": "https://api.jquerymobile.com", "selectors_key": "api", "page_rank": 20 }, | ||
{ "url": "https://jquerymobile.com", "page_rank": 10 } | ||
], | ||
"sitemap_urls": [ | ||
"https://api.jquerymobile.com/wp-sitemap.xml" | ||
], | ||
"// stop_urls": [ | ||
"// Exclude URLs containing '?' such as /themeroller/?...", | ||
"// Avoid excluding https://jquerymobile.com/resources/ itself", | ||
"// Entries are regular expressions, so literal dots are escaped" | ||
], | ||
"stop_urls": [ | ||
"\\?", | ||
"\\.com/category/", | ||
"\\.com/resources/.+", | ||
"\\.com/\\d\\." | ||
], | ||
"selectors": { | ||
"default": { | ||
"lvl0": { | ||
"selector": "#menu-top .menu-item.current > a", | ||
"global": true, | ||
"default_value": "Documentation" | ||
}, | ||
"lvl1": "#content h1", | ||
"lvl2": "#content h2", | ||
"lvl3": "#content h3", | ||
"lvl4": "#content h4", | ||
"lvl5": "#content h5", | ||
"text": "#content p, #content li, #content tr" | ||
}, | ||
"api": { | ||
"lvl0": { | ||
"selector": "#categories .cat-item.current-cat > a", | ||
"global": true, | ||
"default_value": "API" | ||
}, | ||
"lvl1": "#content h1", | ||
"lvl2": "#content h2, #content h4.name", | ||
"lvl3": "#content h3, #content h4:not(.name)", | ||
"lvl4": "#content h5, #content strong:first-child", | ||
"text": ".entry-content p, .entry-content li" | ||
} | ||
}, | ||
"custom_settings": { | ||
"token_separators": ["_", "-", "."] | ||
}, | ||
"selectors_exclude": [ | ||
"header ~ article", | ||
".returns", | ||
".version-details", | ||
".section-title", | ||
".icon-link.toc-link", | ||
"[class^=toclevel]", | ||
"#toctitle", | ||
".desc strong:first-child", | ||
"#quick-nav header h2" | ||
], | ||
"min_indexed_level": 2, | ||
"scrape_start_urls": false | ||
} |