liblcf
Loading...
Searching...
No Matches
reader_util.cpp
Go to the documentation of this file.
1/*
2 * This file is part of liblcf. Copyright (c) liblcf authors.
3 * https://github.com/EasyRPG/liblcf - https://easyrpg.org
4 *
5 * liblcf is Free/Libre Open Source Software, released under the MIT License.
6 * For the full copyright and license information, please view the COPYING
7 * file that was distributed with this source code.
8 */
9
10#include "lcf/config.h"
11#include "lcf/scope_guard.h"
12
13#if LCF_SUPPORT_ICU == 1
14# include <unicode/ucsdet.h>
15# include <unicode/ucnv.h>
16# include <unicode/normalizer2.h>
17# include <unicode/ustring.h>
18#elif LCF_SUPPORT_ICU == 2
19# ifndef _WIN32
20# error "icu.h only supported on Windows"
21# endif
22# include <icu.h>
23#endif
24
25#ifdef _WIN32
26# include <windows.h>
27#else
28# include <locale>
29#endif
30
31#include <algorithm>
32#include <cstdio>
33#include <cstdlib>
34#include <sstream>
35#include <vector>
36
37#include "lcf/encoder.h"
38#include "lcf/inireader.h"
39#include "lcf/ldb/reader.h"
40#include "lcf/reader_util.h"
41#include "log.h"
42
43namespace lcf {
44
// Empty ReaderUtil namespace block; the functions below are defined with
// qualified ReaderUtil:: names.
namespace ReaderUtil {
} // namespace ReaderUtil
47
48std::string ReaderUtil::CodepageToEncoding(int codepage) {
49 if (codepage == 0)
50 return {};
51
52 if (codepage == 932) {
53 return "ibm-943_P15A-2003";
54 }
55 if (codepage == 949) {
56 return "windows-949-2000";
57 }
58
59 return "windows-" + std::to_string(codepage);
60}
61
62std::string ReaderUtil::DetectEncoding(lcf::rpg::Database& db) {
63 std::vector<std::string> encodings = DetectEncodings(db);
64
65 if (encodings.empty()) {
66 return {};
67 }
68
69 return encodings.front();
70}
71
72std::vector<std::string> ReaderUtil::DetectEncodings(lcf::rpg::Database& db) {
73#if LCF_SUPPORT_ICU
74 std::ostringstream text;
75
76 auto append = [](const auto& s) {
77 return ToString(s) + " ";
78 };
79
80 lcf::rpg::ForEachString(db.system, [&](const auto& val, const auto&) {
81 text << append(val);
82 });
83
84 // Cannot use ForEachString here for Terms:
85 // Too much untranslated garbage data in there, even in default database
86 for (const auto& s: {
87 db.terms.menu_save,
88 db.terms.menu_quit,
89 db.terms.new_game,
90 db.terms.load_game,
91 db.terms.exit_game,
92 db.terms.status,
93 db.terms.row,
94 db.terms.order,
95 db.terms.wait_on,
96 db.terms.wait_off,
97 db.terms.level,
98 db.terms.health_points,
99 db.terms.spirit_points,
100 db.terms.normal_status,
101 db.terms.sp_cost,
102 db.terms.attack,
103 db.terms.defense,
104 db.terms.spirit,
105 db.terms.agility,
106 db.terms.weapon,
107 db.terms.shield,
108 db.terms.armor,
109 db.terms.helmet,
110 db.terms.accessory,
111 db.terms.save_game_message,
112 db.terms.load_game_message,
113 db.terms.exit_game_message,
114 db.terms.file,
115 db.terms.yes,
116 db.terms.no
117 }) {
118 text << append(s);
119 }
120
121 return ReaderUtil::DetectEncodings(text.str());
122#else
123 return {"windows-1252"};
124#endif
125}
126
127std::string ReaderUtil::DetectEncoding(std::string_view string) {
128 std::vector<std::string> encodings = DetectEncodings(string);
129
130 if (encodings.empty()) {
131 return {};
132 }
133
134 return encodings.front();
135}
136
137std::vector<std::string> ReaderUtil::DetectEncodings(std::string_view string) {
138 std::vector<std::string> encodings;
139#if LCF_SUPPORT_ICU
140 if (!string.empty()) {
141 UErrorCode status = U_ZERO_ERROR;
142 UCharsetDetector* detector = ucsdet_open(&status);
143
144 auto s = std::string(string);
145
146 int confidence = 0;
147 int32_t matches_count;
148 const UCharsetMatch** matches = nullptr;
149
150 while (true) {
151 ucsdet_setText(detector, s.c_str(), s.length(), &status);
152 matches = ucsdet_detectAll(detector, &matches_count, &status);
153
154 if (!matches || matches_count < 1) {
155 break;
156 }
157
158 confidence = ucsdet_getConfidence(matches[0], &status);
159
160 if (confidence > 70 || s.length() > 100) {
161 break;
162 }
163
164 // Concatenating the string to itself increases the confidence (for short strings)
165 s += s;
166 }
167
168 if (matches != nullptr) {
169 // Collect all candidates, most confident comes first
170
171 for (int i = 0; i < matches_count; ++i) {
172 std::string encoding = ucsdet_getName(matches[i], &status);
173
174 // Fixes to ensure proper Windows encodings
175 if (encoding == "Shift_JIS") {
176 encodings.emplace_back("ibm-943_P15A-2003"); // Japanese with \ as backslash
177 } else if (encoding == "EUC-KR") {
178 encodings.emplace_back("windows-949-2000"); // Korean with \ as backlash
179 } else if (encoding == "GB18030") {
180 encodings.emplace_back("windows-936-2000"); // Simplified Chinese
181 } else if (encoding == "ISO-8859-1" || encoding == "windows-1252") {
182 encodings.emplace_back("ibm-5348_P100-1997"); // Occidental with Euro
183 } else if (encoding == "ISO-8859-2" || encoding == "windows-1250") {
184 encodings.emplace_back("ibm-5346_P100-1998"); // Central Europe with Euro
185 } else if (encoding == "ISO-8859-5" || encoding == "windows-1251") {
186 encodings.emplace_back("ibm-5347_P100-1998"); // Cyrillic with Euro
187 } else if (encoding == "ISO-8859-6" || encoding == "windows-1256") {
188 encodings.emplace_back("ibm-9448_X100-2005"); // Arabic with Euro + 8 chars
189 } else if (encoding == "ISO-8859-7" || encoding == "windows-1253") {
190 encodings.emplace_back("ibm-5349_P100-1998"); // Greek with Euro
191 } else if (encoding == "ISO-8859-8" || encoding == "windows-1255") {
192 encodings.emplace_back("ibm-9447_P100-2002"); // Hebrew with Euro
193 } else if (encoding == "UTF-16BE" || encoding == "UTF-16LE") {
194 // ignore encodings that are obviously wrong
195 } else {
196 encodings.push_back(encoding);
197 }
198 }
199 }
200 ucsdet_close(detector);
201 }
202#else
203 encodings.push_back("windows-1252");
204#endif
205
206 return encodings;
207}
208
209std::string ReaderUtil::GetEncoding(std::string_view ini_file) {
210#if LCF_SUPPORT_INI
211 INIReader ini(ToString(ini_file));
212 if (ini.ParseError() != -1) {
213 auto encoding = ini.Get("EasyRPG", "Encoding", "");
214 if (!encoding.empty()) {
215 return ReaderUtil::CodepageToEncoding(atoi(std::string(encoding).c_str()));
216 }
217 }
218#else
219 Log::Warning("Could not get encoding from ini file, disabled in this liblcf build.");
220#endif
221 return {};
222}
223
224std::string ReaderUtil::GetEncoding(std::istream& filestream) {
225#if LCF_SUPPORT_INI
226 INIReader ini(filestream);
227 if (ini.ParseError() != -1) {
228 auto encoding = ini.Get("EasyRPG", "Encoding", "");
229 if (!encoding.empty()) {
230 return ReaderUtil::CodepageToEncoding(atoi(std::string(encoding).c_str()));
231 }
232 }
233#else
234 Log::Warning("Could not get encoding from ini file, disabled in this liblcf build.");
235#endif
236 return {};
237}
238
239std::string ReaderUtil::GetLocaleEncoding() {
240#ifdef _WIN32
241 int codepage = GetACP();
242#elif __ANDROID__
243 // No std::locale support in NDK
244 // Doesn't really matter because the Android version auto-detects via ICU
245 int codepage = 1252;
246#else
247 int codepage = 1252;
248
249 std::locale loc = std::locale("");
250 // Gets the language and culture part only
251 std::string loc_full = loc.name().substr(0, loc.name().find_first_of("@."));
252 // Gets the language part only
253 std::string loc_lang = loc.name().substr(0, loc.name().find_first_of("_"));
254
255 if (loc_lang == "th") codepage = 874;
256 else if (loc_lang == "ja") codepage = 932;
257 else if (loc_full == "zh_CN" ||
258 loc_full == "zh_SG") codepage = 936;
259 else if (loc_lang == "ko") codepage = 949;
260 else if (loc_full == "zh_TW" ||
261 loc_full == "zh_HK") codepage = 950;
262 else if (loc_lang == "cs" ||
263 loc_lang == "hu" ||
264 loc_lang == "pl" ||
265 loc_lang == "ro" ||
266 loc_lang == "hr" ||
267 loc_lang == "sk" ||
268 loc_lang == "sl") codepage = 1250;
269 else if (loc_lang == "ru") codepage = 1251;
270 else if (loc_lang == "ca" ||
271 loc_lang == "da" ||
272 loc_lang == "de" ||
273 loc_lang == "en" ||
274 loc_lang == "es" ||
275 loc_lang == "fi" ||
276 loc_lang == "fr" ||
277 loc_lang == "it" ||
278 loc_lang == "nl" ||
279 loc_lang == "nb" ||
280 loc_lang == "pt" ||
281 loc_lang == "sv" ||
282 loc_lang == "eu") codepage = 1252;
283 else if (loc_lang == "el") codepage = 1253;
284 else if (loc_lang == "tr") codepage = 1254;
285 else if (loc_lang == "he") codepage = 1255;
286 else if (loc_lang == "ar") codepage = 1256;
287 else if (loc_lang == "et" ||
288 loc_lang == "lt" ||
289 loc_lang == "lv") codepage = 1257;
290 else if (loc_lang == "vi") codepage = 1258;
291#endif
292
293 return CodepageToEncoding(codepage);
294}
295
296std::string ReaderUtil::Recode(std::string_view str_to_encode, std::string_view source_encoding) {
297 lcf::Encoder enc(ToString(source_encoding));
298 std::string out = ToString(str_to_encode);
299 enc.Encode(out);
300 return out;
301}
302
303std::string ReaderUtil::Normalize(std::string_view str) {
304 if (str.empty()) {
305 return {};
306 }
307
308#if LCF_SUPPORT_ICU
309 UErrorCode err = U_ZERO_ERROR;
310
311 auto log_warning = [err, &str](const char* func_name) {
312 Log::Error("%s failed while normalizing \"%s\": %s", func_name, std::string(str).c_str(), u_errorName(err));
313 return std::string(str);
314 };
315
316 std::vector<UChar> uni(str.length() + 1); // including \0
317 int32_t uni_length; // length in utf-16
318 u_strFromUTF8Lenient(uni.data(), uni.size(), &uni_length, str.data(), str.length(), &err);
319 if (U_FAILURE(err)) {
320 return log_warning("u_strFromUTF8Lenient");
321 }
322
323 uni_length = u_strToLower(uni.data(), uni.size(), uni.data(), uni_length, "", &err);
324 if (U_FAILURE(err)) {
325 return log_warning("u_strToLower");
326 }
327
328 std::vector<char> res;
329 int res_capac = uni.size() * 4 + 1; // a codepoint in utf-8 is at most 4 bytes
330 res.resize(res_capac);
331
332 const UNormalizer2* norm = unorm2_getNFKCInstance(&err);
333 if (U_FAILURE(err)) {
334 static bool err_reported = false;
335 if (!err_reported) {
336 Log::Error("Normalizer2::getNFKCInstance failed (%s). \"nrm\" is probably missing in the ICU data file. Unicode normalization will not work!", u_errorName(err));
337 err_reported = true;
338 }
339 err = U_ZERO_ERROR;
340
341 // error handling: return the lowercased string
342 u_strToUTF8(res.data(), res_capac, &uni_length, uni.data(), uni_length, &err);
343 if (U_FAILURE(err)) {
344 return log_warning("u_strToUTF8 (1)");
345 }
346
347 return std::string(res.data(), uni_length);
348 }
349
350 std::vector<UChar> uni_norm(uni_length * 2 + 1); // * 2 for cases where the normalization is larger than the input
351 auto uni_norm_length = unorm2_normalize(norm, uni.data(), uni_length, uni_norm.data(), uni_norm.size(), &err);
352
353 if (U_FAILURE(err)) {
354 log_warning("unorm2_normalize");
355
356 err = U_ZERO_ERROR;
357
358 // error handling: return the lowercased string
359 u_strToUTF8(res.data(), res_capac, &uni_length, uni.data(), uni_length, &err);
360 if (U_FAILURE(err)) {
361 return log_warning("u_strToUTF8 (2)");
362 }
363 } else {
364 // success: return the lowercased and normalized string
365 u_strToUTF8(res.data(), res_capac, &uni_length, uni_norm.data(), uni_norm_length, &err);
366 if (U_FAILURE(err)) {
367 return log_warning("u_strToUTF8 (3)");
368 }
369 }
370
371 return std::string(res.data(), uni_length);
372#else
373 auto result = std::string(str);
374 std::transform(result.begin(), result.end(), result.begin(), tolower);
375 return result;
376#endif
377}
378
379} //namespace lcf
void Warning(const char *fmt,...) LIKE_PRINTF
void Error(const char *fmt,...) LIKE_PRINTF