Skip to content

Commit

Permalink
Add CDDL validation using cddlparser and pyodide (#837)
Browse files Browse the repository at this point in the history
There is no immediate parser in Node.js that could be used to validate CDDL
extracts. Instead, the idea is to run the cddlparser Python package using a
Python interpreter running in WebAssembly.

This provides a CDDL validation guarantee for Webref.

Some notes:
- The version of cddlparser used by Strudy is specified in the
`requirements.txt` file. Goal is for Dependabot to create a pull request
automatically when a new version becomes available.
- Loading could be optimized not to depend on `micropip` at all, but that
requires hardcoding the Wheel URL of the cddlparser package somehow.
- The line position in the anomalies report needs to be adjusted to account for
the header that Reffy adds and that gets stripped during "expansion". That's
hardcoded for now. I'd say that's good enough...
- The CDDL parser stops as soon as it encounters an error, the list of
anomalies only contains the first problem as a result.
- The CDDL parser does not validate additional CDDL semantics for now.
  • Loading branch information
tidoust authored Dec 19, 2024
1 parent c7bc0a8 commit ae96bd5
Show file tree
Hide file tree
Showing 7 changed files with 249 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,9 @@ updates:
interval: daily
time: '09:00'
open-pull-requests-limit: 10
- package-ecosystem: pip
directory: "/"
schedule:
interval: daily
time: '09:00'
open-pull-requests-limit: 10
21 changes: 21 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
"commander": "12.1.0",
"gray-matter": "^4.0.3",
"jsdom": "^25.0.1",
"pyodide": "^0.26.4",
"reffy": "^18.0.1",
"semver": "^7.3.5",
"webidl2": "^24.2.2"
Expand Down
4 changes: 4 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# The CDDL parser used by Strudy is written in Python.
# This file sets the version of the CDDL parser run by Strudy and makes it
# possible for Dependabot to detect and propose updates.
cddlparser==0.4.0
107 changes: 107 additions & 0 deletions src/lib/study-cddl.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
/**
* Analyze the CDDL extracted during a crawl and create an anomalies report.
*
* The CDDL parser surrenders as soon as it encounters an error. As such, the
* anomalies report will contain at most one anomaly per CDDL module, even
* though there may be many more problems with the CDDL.
*
* Note: for specs that define multiple CDDL modules, anomalies will also be
* reported for the "all" module. That may appear as duplicating the errors,
* but will be useful to detect rules that get defined with the same name in
* two different modules (once the CDDL parser actually detects that).
*
* The analysis makes no attempt for now at assembling CDDL modules that
* are part of the same namespace. For that to become possible, the recipe to
* follow to create the namespaces out of modules will have to be specified
* somehow.
*/

import { loadPyodide } from 'pyodide';
import fs from 'node:fs/promises';
import path from 'node:path';
import { fileURLToPath } from "node:url";

const scriptPath = path.dirname(fileURLToPath(import.meta.url));

// Python WebAssembly environment (initialized once)
let pyodide = null;

async function studyCddl (specs, { crawledResults = [] } = {}) {
const report = []; // List of anomalies to report

// Setup Python WebAssembly environment
if (!pyodide) {
pyodide = await loadPyodide();

// Retrieve the version of the cddlparser package to install from
// the "requirements.txt" file at the root of the project
const file = path.join(scriptPath, '..', '..', 'requirements.txt');
const contents = await fs.readFile(file, 'utf8');
const version = contents.match(/^cddlparser==(.*)/m)[1];

// Load micropip to "install" the cddlparser package.
// An alternative would be to `loadPackage` cddlparser from a Wheel URL
// directly to avoid having to install micropip. The Wheel URL in Pypi does
// not seem predictable. But the one for GitHub releases could work, e.g.:
// https://github.com/tidoust/cddlparser/releases/download/v0.4.0/cddlparser-0.4.0-py3-none-any.whl
// This would make loading slightly faster and avoid introducing yet
// another dependency. But it would hardcode the cddlparser repository name
// whereas the project could perhaps be moved in the future.
await pyodide.loadPackage('micropip');
const micropip = pyodide.pyimport('micropip');
await micropip.install(`cddlparser==${version}`);
}

// Python will report execution results to the standard output
let stdout = '';
pyodide.setStdout({
batched: str => stdout += str + '\n'
});

for (const spec of specs) {
for (const cddlModule of (spec.cddl ?? [])) {
stdout = '';
await pyodide.runPythonAsync(`
from cddlparser import parse
try:
parse('''${cddlModule.cddl}''')
except Exception as err:
print(err)
`);
stdout = stdout.trim();

// If nothing was reported, that means parsing succeeded, CDDL is valid
if (!stdout) {
continue;
}

let message = cddlModule.name ?
`In the "${cddlModule.name}" module, ` :
'';
message += stdout;

// The parser reports the line number at which the error occurred... but
// it gets run against the expanded extract, and expansion drops the
// header that Reffy adds to the extracts to note the provenance. The line
// number is off by 5 as a result. Let's fix that. It's certainly not
// fantastic to hardcode the number of lines that compose the header, but
// let's say that's good enough for now!
const reLineNumber = / line (\d+):/;
const match = message.match(reLineNumber);
if (match) {
let lineNumber = parseInt(match[1], 10);
lineNumber += 5;
message = message.replace(reLineNumber, ` line ${lineNumber}:`);
}

report.push({
name: 'invalidCddl',
message,
spec
});
}
}
return report;
}

export default studyCddl;
14 changes: 14 additions & 0 deletions src/lib/study.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import studyCddl from './study-cddl.js';
import studyDfns from './study-dfns.js';
import studyAlgorithms from './study-algorithms.js';
import studyBackrefs from './study-backrefs.js';
Expand Down Expand Up @@ -187,6 +188,19 @@ const anomalyGroups = [
study: studyWebIdl,
studyParams: ['curated'],
cc: true
},

{
name: 'cddl',
title: 'Problems with CDDL',
description: 'The following problems were identified when analyzing CDDL definitions',
types: [
{
name: 'invalidCddl',
title: 'Invalid CDDL syntax'
}
],
study: studyCddl
}
];

Expand Down
96 changes: 96 additions & 0 deletions test/study-cddl.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
/**
* Tests CDDL analysis.
*
* Note line numbers in reported anomalies are off by 5. That's normal, the
* analysis compensates for the header that Reffy adds to the CDDL extracts
* and that isn't included in the tests.
*/
/* global describe, it */

import study from '../src/lib/study-cddl.js';
import { assertNbAnomalies, assertAnomaly } from './util.js';

describe('The CDDL analyser', function () {
this.timeout(5000);

const specUrl = 'https://www.w3.org/TR/spec';
const specUrl2 = 'https://www.w3.org/TR/spec2';

function toCrawlResult (cddl, cddl2) {
const crawlResult = [];

function addSpecAndCddl(url, cddl) {
if (typeof cddl === 'string') {
crawlResult.push({ url, cddl: [{ name: '', cddl }] });
}
else if (Array.isArray(cddl)) {
crawlResult.push({ url, cddl });
}
else {
crawlResult.push({ url, cddl: [cddl] });
}
}

addSpecAndCddl(specUrl, cddl);
if (cddl2) {
addSpecAndCddl(specUrl2, cddl2);
}
return crawlResult;
}

function analyzeCddl (cddl, cddl2) {
const crawlResult = toCrawlResult(cddl, cddl2);
return study(crawlResult);
}

it('reports no anomaly if CDDL is valid', async () => {
const crawlResult = toCrawlResult('cddl = tstr');
const report = await study(crawlResult);
assertNbAnomalies(report, 0);
});

it('reports an anomaly when CDDL is invalid', async () => {
const crawlResult = toCrawlResult('cddl');
const report = await study(crawlResult);
assertNbAnomalies(report, 1);
assertAnomaly(report, 0, {
name: 'invalidCddl',
message: 'CDDL syntax error - line 6: expected assignment (`=`, `/=`, `//=`) after `cddl`, got ``',
spec: { url: specUrl }
});
});

it('reports the CDDL module name', async () => {
const crawlResult = toCrawlResult({
name: 'mo-mo-mo',
cddl: 'nan = 0.'
});
const report = await study(crawlResult);
assertNbAnomalies(report, 1);
assertAnomaly(report, 0, {
name: 'invalidCddl',
message: 'In the "mo-mo-mo" module, CDDL token error - line 6: expected number with fraction to have digits in fraction part, got `0.`',
spec: { url: specUrl }
});
});

it('validates all CDDL modules', async () => {
const crawlResult = toCrawlResult([
{ name: 'all', cddl: 'cddl = tstr\nnobinary = 0b' },
{ name: 'valid', cddl: 'cddl = tstr' },
{ name: 'invalid', cddl: 'nobinary = 0b'}
]);
const report = await study(crawlResult);
assertNbAnomalies(report, 2);
assertAnomaly(report, 0, {
name: 'invalidCddl',
message: 'In the "all" module, CDDL token error - line 7: expected binary number to have binary digits, got `0b`',
spec: { url: specUrl }
});
assertAnomaly(report, 1, {
name: 'invalidCddl',
message: 'In the "invalid" module, CDDL token error - line 6: expected binary number to have binary digits, got `0b`',
spec: { url: specUrl }
});
});
});

0 comments on commit ae96bd5

Please sign in to comment.