From f9e0dc72c3d6636dfd06dc1e09bd9f8244f52685 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ant=C3=B3nio=20Afonso?= Date: Mon, 17 Jun 2024 18:01:43 -0700 Subject: [PATCH] Also detect windows-1250 every time windows-1252 is detected This is in relation to https://github.com/aadsm/jschardet/issues/70. The differences between the two are minimal, so this will be a workaround for now. These two encodings use very different models for detection. The windows-1252 detector is purely based on the occurrence probability of each character's class. The windows-1250 detector uses a Hungarian language model to detect whether the text is in Hungarian. This is brittle as there are other languages using windows-1250. --- src/universaldetector.js | 31 ++++++++++++++++++++++++++----- tests/detectEncodings.test.js | 10 ++++++++++ tests/encodings.test.js | 5 +++-- tests/fixtures/windows-1250.txt | 1 + 4 files changed, 40 insertions(+), 7 deletions(-) create mode 100644 tests/fixtures/windows-1250.txt diff --git a/src/universaldetector.js b/src/universaldetector.js index 3f64c60..c8cf302 100644 --- a/src/universaldetector.js +++ b/src/universaldetector.js @@ -235,14 +235,35 @@ function UniversalDetector(options) { return this.result; } - if( this._mInputState == _state.highbyte ) { - for( var i = 0, prober; prober = this._mCharsetProbers[i]; i++ ) { - if( !prober || !prober.getCharsetName() || !canDetectEncoding(prober.getCharsetName()) ) continue; + if (this._mInputState == _state.highbyte) { + let windows_1252_confidence = 0; + let windows_1250_detected = false; + for (var i = 0, prober; prober = this._mCharsetProbers[i]; i++) { + if (!prober) continue; + const charsetName = prober.getCharsetName(); + const confidence = prober.getConfidence(); + if (prober.getCharsetName() === "windows-1252") { + windows_1252_confidence = confidence; + } + if (!charsetName || !canDetectEncoding(charsetName)) continue; this.results.push({ "encoding": prober.getCharsetName(), - "confidence": prober.getConfidence() + 
"confidence": confidence + }); + if (prober.getCharsetName() === "windows-1250") { + windows_1250_detected = true; + } + logger.log(prober.getCharsetName() + " confidence " + confidence); + } + // HACK: When windows-1252 is detected it's almost sure that it can + // also be windows-1250. + // https://en.wikipedia.org/wiki/Windows-1250 (Central European) + if (windows_1252_confidence && !windows_1250_detected && canDetectEncoding("windows-1250")) { + this.results.push({ + "encoding": "windows-1250", + // Report the confidence just a bit under windows-1252's. + "confidence": windows_1252_confidence - 5/10**(String(windows_1252_confidence).length - 1), }); - logger.log(prober.getCharsetName() + " confidence " + prober.getConfidence()); } this.results.sort(function(a, b) { return b.confidence - a.confidence; diff --git a/tests/detectEncodings.test.js b/tests/detectEncodings.test.js index 471d5f7..edef68b 100644 --- a/tests/detectEncodings.test.js +++ b/tests/detectEncodings.test.js @@ -50,3 +50,13 @@ test('detectEncodings locks down which encodings to detect (SHIFT_JIS)', async ( }); expect(shortSingleEncoding.encoding).toBe("SHIFT_JIS") }); + +test('detectEncodings should also report windows-1250 when it detects windows-1252', async () => { + const fixturePath = `${__dirname}/fixtures/windows-1250.txt`; + fileContents = await utils.readFileAsBuffer(fixturePath); + const possibleEncodings = jschardet.detectAll(fileContents, { + detectEncodings: ["windows-1250"], + }); + expect(possibleEncodings.length).toBe(1); + expect(possibleEncodings[0].encoding).toBe("windows-1250") +}); diff --git a/tests/encodings.test.js b/tests/encodings.test.js index d180e41..de6f43c 100644 --- a/tests/encodings.test.js +++ b/tests/encodings.test.js @@ -246,8 +246,9 @@ describe.skip("Not sure how to test these", function() { }); test("windows-1250 (Hungarian)", function() { - // 🤷 - var str = ""; + // Příliš žluťoučký kůň úpěl ďábelské ódy + var str = 
"\x50\xf8\xed\x6c\x69\x9a\x20\x9e\x6c\x75\x9d\x6f\x75\xe8\x6b\xfd\x20\x6b\xf9\xf2\x20\xfa\x70\xec\x6c\x20\xef\xe1\x62\x65\x6c\x73\x6b\xe9\x20\xf3\x64\x79"; + // It's reporting IBM866 for some reason.. expect(jschardet.detect(str).encoding).toBe("windows-1250") }); diff --git a/tests/fixtures/windows-1250.txt b/tests/fixtures/windows-1250.txt new file mode 100644 index 0000000..e92cf1b --- /dev/null +++ b/tests/fixtures/windows-1250.txt @@ -0,0 +1 @@ +Pli luouk k pl belsk dy \ No newline at end of file