Skip to content

Commit

Permalink
WIP Sync from Reddit (#2)
Browse files Browse the repository at this point in the history
  • Loading branch information
shakuzen authored Sep 18, 2023
1 parent a52c6f6 commit 7f61c69
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 0 deletions.
21 changes: 21 additions & 0 deletions .github/workflows/sync-from-reddit.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Scheduled workflow that runs download-wiki.sh and pushes whatever commits
# the script created.
name: Sync from Reddit

on:
  schedule:
    # run hourly at some arbitrary minute
    - cron: '3 * * * *'

# The last step pushes commits with the workflow token, so it needs write
# access to the repository contents even when the repository default for
# GITHUB_TOKEN is read-only.
permissions:
  contents: write

jobs:
  sync-from-reddit:
    name: Sync from Reddit
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v3
      # download-wiki.sh creates commits, which requires a git identity.
      - name: Configure git user
        run: |
          git config user.name 'github-actions[bot]'
          git config user.email 'github-actions[bot]@users.noreply.github.com'
      - name: Download wiki from Reddit
        run: ./download-wiki.sh
      - name: Push changes (if any)
        run: git push
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
.docusaurus
.cache-loader

# Download folder for sync
/download

# Misc
.DS_Store
.idea
Expand Down
45 changes: 45 additions & 0 deletions download-wiki.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#! /bin/bash

# Download every page of a subreddit wiki as JSON, convert each page to a
# Markdown file inside this repository, and create one git commit per page
# whose Markdown changed.
#
# Requires: bash coreutils curl jq git
# Prerequisite: git user is configured via git config
#
# Exit status: 1 when the list of wiki pages cannot be fetched, otherwise the
# status of the final cleanup. A page that fails to download or parse is
# reported on stderr and skipped; it does not abort the run.

# Adapted from https://www.reddit.com/r/DataHoarder/comments/ga2p8y/comment/idpu8cs/

# -u: treat unset variables as errors.
# pipefail: a failing curl/jq is not hidden by the last command of a pipeline.
set -uo pipefail

USER_AGENT='wikidownload/1.0'
DOWNLOAD_DIR="download"
DOCS_DIR="docs"
NONDOCS_DIR="archive"
SUBREDDIT="JapanFinance"
WIKI_URL="https://www.reddit.com/r/$SUBREDDIT/wiki"

# Fetch the page list up front so that a failed request is reported instead
# of being silently treated as an empty wiki.
if ! PAGES="$(curl -S -s -f --user-agent "$USER_AGENT" "$WIKI_URL/pages.json" \
  | jq -r '.data | .[]')"; then
  printf 'error: could not fetch the wiki page list of r/%s\n' "$SUBREDDIT" >&2
  exit 1
fi

while IFS= read -r PAGE; do
  # A here-string always yields at least one (possibly empty) line.
  [[ -n "$PAGE" ]] || continue

  # Reddit's anonymous access rate limit is 10 requests per minute
  # see https://www.reddit.com/r/redditdev/comments/14nbw6g/updated_rate_limits_going_into_effect_over_the/
  sleep 6.1

  # strip index/ from beginning of page path
  PAGE_PATH="${PAGE#*index/}"
  SOURCE_PAGE_JSON="./$DOWNLOAD_DIR/$PAGE_PATH.json"
  TARGET_PAGE_MD="./$DOCS_DIR/$PAGE_PATH.md"
  # do not publish config markdown, but archive it in this same repo
  if [[ "$PAGE" == config* ]]; then
    TARGET_PAGE_MD="./$NONDOCS_DIR/$PAGE.md"
  fi

  # strip file name from end of path when making directories
  mkdir -p "${SOURCE_PAGE_JSON%/*}" "${TARGET_PAGE_MD%/*}"

  # -f makes curl fail on HTTP errors instead of saving the error body.
  curl -S -s -f --user-agent "$USER_AGENT" "$WIKI_URL/$PAGE.json" > "$SOURCE_PAGE_JSON"
  CURL_STATUS=$?
  # Log the page together with curl's exit status.
  printf '%s/wiki/%s %s\n' "$SUBREDDIT" "$PAGE" "$CURL_STATUS"
  if (( CURL_STATUS != 0 )); then
    printf 'warning: download of %s failed, skipping\n' "$PAGE" >&2
    continue
  fi

  # Never overwrite a page with "null": require real Markdown content.
  if ! jq -e '.data.content_md | type == "string"' "$SOURCE_PAGE_JSON" > /dev/null; then
    printf 'warning: %s has no Markdown content, skipping\n' "$PAGE" >&2
    continue
  fi

  # "// empty" turns a missing/null field into an empty string, not "null".
  REASON="$(jq -r '.data.reason // empty' "$SOURCE_PAGE_JSON")"
  AUTHOR="$(jq -r '.data.revision_by.data.name // empty' "$SOURCE_PAGE_JSON")"

  # Rewrite wiki links before saving Markdown file
  if ! jq -r '.data.content_md' "$SOURCE_PAGE_JSON" \
    | sed "s,https://www\.reddit\.com/r/$SUBREDDIT/wiki/index/,,g" > "$TARGET_PAGE_MD"; then
    printf 'warning: could not write %s, skipping\n' "$TARGET_PAGE_MD" >&2
    continue
  fi

  # If the wiki page was changed, commit it. Only this page's file is
  # inspected, so unrelated changes in the work tree do not trigger a commit.
  if [[ -n "$(git status --porcelain -- "$TARGET_PAGE_MD")" ]]; then
    git add -- "$TARGET_PAGE_MD"
    # Show what is about to be committed; after the commit there would be
    # nothing left to diff.
    git --no-pager diff --cached -- "$TARGET_PAGE_MD"

    COMMIT_MESSAGE=(-m "Sync from Reddit")
    if [[ -n "$REASON" ]]; then COMMIT_MESSAGE+=(-m "$REASON"); fi
    if [[ -n "$AUTHOR" ]]; then COMMIT_MESSAGE+=(-m "Change made by u/$AUTHOR"); fi
    git commit "${COMMIT_MESSAGE[@]}" \
      || printf 'warning: commit of %s failed\n' "$TARGET_PAGE_MD" >&2
  fi

done <<< "$PAGES"

# ":?" aborts instead of running rm -rf on "./" should the variable be empty.
rm -rf -- "./${DOWNLOAD_DIR:?}"

0 comments on commit 7f61c69

Please sign in to comment.