forked from ms609/citation-bot
-
Notifications
You must be signed in to change notification settings - Fork 0
/
constants.php
92 lines (78 loc) · 3.52 KB
/
constants.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
<?php
declare(strict_types=1);
// @codeCoverageIgnoreStart
// all new constant files needed listed here
require_once 'constants/bad_data.php';
require_once 'constants/capitalization.php';
require_once 'constants/math.php';
require_once 'constants/mistakes.php';
require_once 'constants/parameters.php';
require_once 'constants/regular_expressions.php';
require_once 'constants/italics.php';
require_once 'constants/isbn.php';
require_once 'constants/null_good_doi.php';
require_once 'constants/null_bad_doi.php';
// @codeCoverageIgnoreEnd
// Placeholder tokens. TEMP_PLACEHOLDER carries a %s slot, so it is presumably
// completed with sprintf() before use - verify against callers.
const PIPE_PLACEHOLDER = '# # # CITATION_BOT_PLACEHOLDER_PIPE # # #';
const TEMP_PLACEHOLDER = "# # # CITATION_BOT_PLACEHOLDER_TEMPORARY %s # # #";
//Common replacements
// Each *_DECODE / *_ENCODE pair is a set of parallel arrays: element i of one
// array corresponds to element i of the other, so the two lists must stay the
// same length and in the same order (presumably fed to str_replace - verify).
// NOTE(review): the first four entries of HTML_ENCODE are identical to those of
// HTML_DECODE, and HTML_ENCODE_DOI is identical to HTML_DECODE_DOI, so mapping
// between them leaves "[", "]", "<" and ">" unchanged. This looks like encoded
// forms (e.g. HTML entities) that were decoded when this copy of the file was
// produced - confirm against the canonical repository before relying on them.
const HTML_DECODE = ["[", "]", "<", ">", " "];
const HTML_ENCODE = ["[", "]", "<", ">", "+"];
const HTML_DECODE_DOI = ["[", "]", "<", ">"];
const HTML_ENCODE_DOI = ["[", "]", "<", ">"];
// Dot-prefixed hexadecimal character codes (".2F" is "/", ".5B" is "[", ...)
// and the literal characters they stand for, in matching order.
const DOT_ENCODE = [".2F", ".5B", ".7B", ".7D", ".5D", ".3C", ".3E", ".3B", ".28", ".29"];
const DOT_DECODE = ["/", "[", "{", "}", "]", "<", ">", ";", "(", ")"];
// Percent-encoded forms of "#", "<" and ">" and their literal counterparts.
const DOI_URL_ENCODE = ["%23", "%3C", "%3E"];
const DOI_URL_DECODE = ["#", "<", ">"];
// Date-format selectors; judging by the names, MDY/DMY are month-day-year and
// day-month-year and WHATEVER means no preference - TODO confirm with callers.
const DATES_WHATEVER = 0;
const DATES_MDY = 1;
const DATES_DMY = 2;
// Name-list style selectors; AMP and VANC presumably stand for the ampersand
// and Vancouver author-list styles - TODO confirm with callers.
const NAME_LIST_STYLE_DEFAULT = 0;
const NAME_LIST_STYLE_AMP = 1;
const NAME_LIST_STYLE_VANC = 2;
// Contact addresses; the two below marked as used are embedded in the
// User-Agent strings that follow.
// NOTE(review): all three hold the same literal "[email protected]", which
// looks like an e-mail redaction placeholder rather than a real address -
// confirm the real values against the canonical repository.
const COMMONUSERNAME = '[email protected]';
const CROSSREFUSERNAME = '[email protected]';
const PUBMEDUSERNAME = '[email protected]';
// User-Agent strings: identical apart from the mailto: contact address.
const BOT_CROSSREF_USER_AGENT = "Mozilla/5.0 (compatible; Citation_bot; mailto:".CROSSREFUSERNAME."; +https://citations.toolforge.org/)";
const BOT_USER_AGENT = "Mozilla/5.0 (compatible; Citation_bot; mailto:".COMMONUSERNAME ."; +https://citations.toolforge.org/)";
// Base timeouts; bot_curl_init() multiplies these by its $time ratio and uses
// them for CURLOPT_TIMEOUT and CURLOPT_CONNECTTIMEOUT (both in seconds).
const BOT_HTTP_TIMEOUT = 20;
const BOT_CONNECTION_TIMEOUT = 10;
/**
 * cURL progress callback that aborts transfers which have grown too large.
 *
 * Registered through CURLOPT_PROGRESSFUNCTION by bot_curl_init(). Most
 * responses are a sane size, but a few (S2 json data, archived PDFs) are
 * enormous. Only $down - the number of bytes downloaded so far - is inspected;
 * the remaining arguments exist solely to match the callback signature.
 *
 * @return int 0 to let the transfer continue; 1 once more than 128MB has been
 *             downloaded, which makes cURL break the connection.
 */
function curl_limit_page_size(CurlHandle $_ch, int $_DE = 0, int $down = 0, int $_UE = 0, int $_Up = 0): int {
    // 128MB expressed in bytes (134217728).
    $max_bytes = 128 * 1024 * 1024;
    if ($down <= $max_bytes) {
        return 0;
    }
    // Over the limit: record it, then return non-zero to abort the transfer.
    bot_debug_log("Absurdly large curl");
    return 1;
}
/**
 * Create a cURL handle pre-loaded with the bot's default options.
 *
 * Options are applied in three layers: fixed global defaults, timeouts derived
 * from $time, and finally $ops. Because $ops is applied last, any option in it
 * overrides the corresponding default.
 *
 * @param float $time Ratio applied to BOT_HTTP_TIMEOUT and BOT_CONNECTION_TIMEOUT.
 *                    The scaled values are truncated to whole seconds and never
 *                    drop below one second.
 * @param array<int, int|string|bool|array<int, string>> $ops CURLOPT_* => value
 *                    pairs applied on top of the defaults.
 */
function bot_curl_init(float $time, array $ops): CurlHandle {
    $ch = curl_init();
    // cURL timeouts are whole seconds. Passing the raw float product silently
    // drops the fraction, and a small ratio would truncate to 0, which cURL
    // treats as "no timeout" (CURLOPT_TIMEOUT) or "use the built-in default"
    // (CURLOPT_CONNECTTIMEOUT). Convert explicitly and keep at least 1 second.
    $http_timeout = max(1, (int) (BOT_HTTP_TIMEOUT * $time));
    $connect_timeout = max(1, (int) (BOT_CONNECTION_TIMEOUT * $time));
    // 1 - Global Defaults
    curl_setopt_array($ch, [
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_BUFFERSIZE => 524288, // 512kB chunks
        CURLOPT_MAXREDIRS => 20, // No infinite loops for us, 20 for Elsevier and Springer websites
        CURLOPT_USERAGENT => BOT_USER_AGENT,
        CURLOPT_AUTOREFERER => true,
        CURLOPT_REFERER => "https://en.wikipedia.org",
        CURLOPT_COOKIESESSION => true,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_HEADEROPT => CURLHEADER_UNIFIED,
        CURLOPT_PROGRESSFUNCTION => 'curl_limit_page_size',
        CURLOPT_NOPROGRESS => false, // Progress reporting must be on for the size-limit callback to run
        CURLOPT_COOKIEJAR => 'cookie.txt', // Needed for proquest
        CURLOPT_COOKIEFILE => 'cookie.txt', // Needed for proquest
        // 2 - Default Time by ratio
        CURLOPT_TIMEOUT => $http_timeout,
        CURLOPT_CONNECTTIMEOUT => $connect_timeout,
    ]);
    // 3 - Specific options and overrides of defaults
    curl_setopt_array($ch, $ops);
    return $ch;
}
/**
 * Run the transfer on $ch after pointing its Referer header at the last page.
 *
 * The referer is WIKI_ROOT followed by "title=" and Page::$last_title. Warnings
 * from curl_exec() are suppressed and its result is cast to string, so a failed
 * transfer (false) comes back as an empty string rather than raising.
 *
 * @return string The result of curl_exec() as a string; this is the response
 *                body when CURLOPT_RETURNTRANSFER is enabled, as bot_curl_init()
 *                does by default.
 */
function bot_curl_exec(CurlHandle $ch): string {
    $referer = WIKI_ROOT . "title=" . Page::$last_title;
    curl_setopt($ch, CURLOPT_REFERER, $referer);
    $response = @curl_exec($ch);
    return (string) $response;
}