Skip to content

Commit

Permalink
minutes-to-w3c: convert Google Docs to HTML
Browse files Browse the repository at this point in the history
The script converted Google Docs to PDF using Puppeteer. But Google Docs can
be exported to self-contained HTML pages instead (and Puppeteer is not needed
for that). This update downloads HTML versions of minutes taken in Google Docs
so that they may be imported to W3C's web sites.

The script also returns the Markdown updates that need to be made to the issues
to link to the new minutes (and keep a pointer to the initial ones).

Notes:
- The script is not (yet) smart enough to update the issues itself.
- The script takes a second parameter: the prefix to use for the new link
on W3C's web site, e.g., `https://www.w3.org/2024/09/breakouts`.
  • Loading branch information
tidoust committed Nov 6, 2024
1 parent d61dc29 commit 59809ab
Showing 1 changed file with 38 additions and 32 deletions.
70 changes: 38 additions & 32 deletions tools/minutes-to-w3c.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,13 @@
import { getEnvKey } from './lib/envkeys.mjs';
import { fetchProject } from './lib/project.mjs'
import { validateSession } from './lib/validate.mjs';
import puppeteer from 'puppeteer';
import fs from 'node:fs/promises';

async function main(number) {
async function main(number, minutesPrefix) {
const PROJECT_OWNER = await getEnvKey('PROJECT_OWNER', 'w3c');
const PROJECT_NUMBER = await getEnvKey('PROJECT_NUMBER');
console.log();
console.log(`Retrieve project ${PROJECT_OWNER}/${PROJECT_NUMBER}...`);
console.warn();
console.warn(`Retrieve project ${PROJECT_OWNER}/${PROJECT_NUMBER}...`);
const project = await fetchProject(PROJECT_OWNER, PROJECT_NUMBER);
if (!project) {
throw new Error(`Project ${PROJECT_OWNER}/${PROJECT_NUMBER} could not be retrieved`);
Expand All @@ -33,7 +33,7 @@ async function main(number) {
}
}
else {
console.log(`- found ${sessions.length} sessions assigned to a slot and room`);
console.warn(`- found ${sessions.length} sessions assigned to a slot and room`);
}
sessions = await Promise.all(sessions.map(async session => {
const sessionErrors = (await validateSession(session.number, project))
Expand All @@ -45,45 +45,51 @@ async function main(number) {
if (sessions.length === 0) {
throw new Error(`Session ${number} contains errors that need fixing`);
}
else if (sessions[0].description.materials.minutes) {
console.log("Session " + number + ": " + sessions[0].description.materials.minutes);
return;
}
}
for (const session of sessions.filter(s => s.description?.materials?.minutes)) {
const url = session.description.materials.minutes;
if (url.match(/w3\.org|\@\@/)) {
console.warn(`- ${session.number}: skipping minutes at ${url}`);
}
else if (url.match(/docs\.google\.com/)) {
console.warn(`- ${session.number}: processing minutes at ${url}`);
console.log(`=====`);
console.log(`Session #${session.number} ${session.title}`);
console.log(`https://github.com/${session.repository}/issues/${session.number}`);
const file = `minutes-${session.number}.html`;
const exportUrl = url.match(/\/edit.*$/) ?
url.replace(/\/edit.*$/, '/export') :
url + '/export';
const res = await fetch(exportUrl);

// Links in Google Docs are prefixed, and additional styles may be
// imported. Get rid of that to avoid relying on third-party servers.
let html = await res.text();
html = html
.replace(/@import url\(https:\/\/themes\.googleusercontent\.com\/[^\)]*\);/g, '')
.replace(/href="https:\/\/www\.google\.com\/url\?q=([^&]+)&[^"]+"/g, 'href="$1"');
await fs.writeFile(file, html, 'utf8');
console.log(`- [Minutes](${minutesPrefix}/minutes-${session.number}.html)`);
console.log(`- [Minutes - initial Google doc](${url})`);
}
else {
for (const session of sessions.filter(s => s.description.materials.minutes)) {
const url = session.description.materials.minutes;
if (url.match(/w3\.org|\@\@/)) {
console.log("Skipping " + session.number + ": " + url);
} else if (url.match(/docs\.google\.com/)) {
console.log(session.number + ": " + session.description.materials.minutes);
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(url);
await page.pdf({
path: session.number + '-minutes.pdf',
});
await browser.close();
})();
} else {
console.log("Manually get: " + session.number + ": " + session.description.materials.minutes);
}
}
console.warn(`- ${session.number}: need to get minutes at ${url}`);
}
}
console.log(`Retrieve project ${PROJECT_OWNER}/${PROJECT_NUMBER} and session(s)... done`);
console.warn(`Retrieve project ${PROJECT_OWNER}/${PROJECT_NUMBER} and session(s)... done`);
}


// Read session number from command-line
if (process.argv[2] && !process.argv[2].match(/^(\d+|all)$/)) {
console.log('First parameter should be a session number or "all"');
console.warn('First parameter should be a session number or "all"');
process.exit(1);
}
const sessionNumber = process.argv[2]?.match(/^\d+$/) ? parseInt(process.argv[2], 10) : undefined;
const minutesPrefix = process.argv[3];

main(sessionNumber)
main(sessionNumber, minutesPrefix)
.catch(err => {
console.log(`Something went wrong: ${err.message}`);
console.error(`Something went wrong: ${err.message}`);
throw err;
});

0 comments on commit 59809ab

Please sign in to comment.