Skip to content

Commit

Permalink
Also detect windows-1250 every time windows-1252 is detected
Browse files Browse the repository at this point in the history
This is in relation to #70. The differences between the two encodings are minimal, so this will be a workaround for now. The two encodings use very different models for detection. The windows-1252 detector is based purely on the occurrence probability of each character's class. The windows-1250 detector uses a Hungarian language model to decide whether the text is Hungarian. This is brittle, as there are other languages that use windows-1250.
  • Loading branch information
aadsm committed Jun 18, 2024
1 parent f42d262 commit f9e0dc7
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 7 deletions.
31 changes: 26 additions & 5 deletions src/universaldetector.js
Original file line number Diff line number Diff line change
Expand Up @@ -235,14 +235,35 @@ function UniversalDetector(options) {
return this.result;
}

if( this._mInputState == _state.highbyte ) {
for( var i = 0, prober; prober = this._mCharsetProbers[i]; i++ ) {
if( !prober || !prober.getCharsetName() || !canDetectEncoding(prober.getCharsetName()) ) continue;
if (this._mInputState == _state.highbyte) {
let windows_1252_confidence = 0;
let windows_1250_detected = false;
for (var i = 0, prober; prober = this._mCharsetProbers[i]; i++) {
if (!prober) continue;
const charsetName = prober.getCharsetName();
const confidence = prober.getConfidence();
if (prober.getCharsetName() === "windows-1252") {
windows_1252_confidence = confidence;
}
if (!charsetName || !canDetectEncoding(charsetName)) continue;
this.results.push({
"encoding": prober.getCharsetName(),
"confidence": prober.getConfidence()
"confidence": confidence
});
if (prober.getCharsetName() === "windows-1250") {
windows_1250_detected = true;
}
logger.log(prober.getCharsetName() + " confidence " + confidence);
}
// HACK: When windows-1252 is detected it's almost sure that it can
// also be windows-1250.
// https://en.wikipedia.org/wiki/Windows-1250 (Central European)
if (windows_1252_confidence && !windows_1250_detected && canDetectEncoding("windows-1250")) {
this.results.push({
"encoding": "windows-1250",
// Report the confidence just a bit under windows-1252's.
"confidence": windows_1252_confidence - 5/10**(String(windows_1252_confidence).length - 1),
});
logger.log(prober.getCharsetName() + " confidence " + prober.getConfidence());
}
this.results.sort(function(a, b) {
return b.confidence - a.confidence;
Expand Down
10 changes: 10 additions & 0 deletions tests/detectEncodings.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,13 @@ test('detectEncodings locks down which encodings to detect (SHIFT_JIS)', async (
});
expect(shortSingleEncoding.encoding).toBe("SHIFT_JIS")
});

// Checks that, when detection is restricted to windows-1250, detectAll
// returns exactly one candidate for the windows-1250 fixture and that the
// candidate is reported as windows-1250.
test('detectEncodings should also report windows-1250 when it detects windows-1252', async () => {
  const fixturePath = `${__dirname}/fixtures/windows-1250.txt`;
  // Keep the buffer local to this test: assigning to an undeclared (or
  // shared) name would leak state into other tests in the file.
  const fileContents = await utils.readFileAsBuffer(fixturePath);
  const possibleEncodings = jschardet.detectAll(fileContents, {
    detectEncodings: ["windows-1250"],
  });
  expect(possibleEncodings.length).toBe(1);
  expect(possibleEncodings[0].encoding).toBe("windows-1250");
});
5 changes: 3 additions & 2 deletions tests/encodings.test.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions tests/fixtures/windows-1250.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Příliš žluťoučký kůň úpěl ďábelské ódy

0 comments on commit f9e0dc7

Please sign in to comment.