// String helpers: conversion between Windows-1250 and UTF-8 encoded text.
#ifndef NONUT_CORE_STRING_HELPERS_H
|
|
#define NONUT_CORE_STRING_HELPERS_H
|
|
#include <cstddef>
#include <string>
#include <string_view>
#include <unordered_map>
|
|
|
|
namespace nonut
|
|
{
|
|
/// Lookup table: Windows-1250 byte (0x80..0xFF) -> UTF-8 byte sequence.
///
/// Bytes 0x81, 0x83, 0x88, 0x90 and 0x98 are unassigned in Windows-1250 and
/// therefore have no entry. Bytes below 0x80 are plain ASCII and need no
/// translation, so they are not listed either.
///
/// Declared `inline` so that every translation unit including this header
/// shares one table instead of building a private copy.
inline const std::unordered_map<unsigned char, std::string_view> WINDOWS1250_UTF8_MAP
{
    {'\x80', "\xe2\x82\xac"}, // euro sign
    {'\x82', "\xe2\x80\x9a"}, // single low quotation mark
    {'\x84', "\xe2\x80\x9e"}, // double low quotation mark
    {'\x85', "\xe2\x80\xa6"}, // ellipsis
    {'\x86', "\xe2\x80\xa0"}, // dagger
    {'\x87', "\xe2\x80\xa1"}, // double dagger
    {'\x89', "\xe2\x80\xb0"}, // per mille
    {'\x8a', "\xc5\xa0"}, // S with caron
    {'\x8b', "\xe2\x80\xb9"}, // single left guillemet
    {'\x8c', "\xc5\x9a"}, // S with acute
    {'\x8d', "\xc5\xa4"}, // T with caron
    {'\x8e', "\xc5\xbd"}, // Z with caron
    {'\x8f', "\xc5\xb9"}, // Z with acute

    {'\x91', "\xe2\x80\x98"}, // single upper quotation mark (opening)
    {'\x92', "\xe2\x80\x99"}, // single upper quotation mark (closing)
    {'\x93', "\xe2\x80\x9c"}, // double upper quotation mark (opening)
    {'\x94', "\xe2\x80\x9d"}, // double upper quotation mark (closing)
    {'\x95', "\xe2\x80\xa2"}, // bullet sign
    {'\x96', "\xe2\x80\x93"}, // en dash
    {'\x97', "\xe2\x80\x94"}, // em dash
    {'\x99', "\xe2\x84\xa2"}, // trademark sign
    {'\x9a', "\xc5\xa1"}, // s with caron
    {'\x9b', "\xe2\x80\xba"}, // single right guillemet
    {'\x9c', "\xc5\x9b"}, // s with acute
    {'\x9d', "\xc5\xa5"}, // t with caron
    {'\x9e', "\xc5\xbe"}, // z with caron
    {'\x9f', "\xc5\xba"}, // z with acute

    // NBSP is emitted as a plain ASCII space, not as U+00A0 ("\xc2\xa0").
    // NOTE(review): presumably intentional - confirm before changing.
    {'\xa0', "\x20"}, // NBSP
    {'\xa1', "\xcb\x87"}, // caron
    {'\xa2', "\xcb\x98"}, // breve
    {'\xa3', "\xc5\x81"}, // L with stroke
    {'\xa4', "\xc2\xa4"}, // currency sign
    {'\xa5', "\xc4\x84"}, // A with ogonek
    {'\xa6', "\xc2\xa6"}, // broken vertical bar
    {'\xa7', "\xc2\xa7"}, // section sign
    {'\xa8', "\xc2\xa8"}, // diaeresis
    {'\xa9', "\xc2\xa9"}, // copyright sign
    {'\xaa', "\xc5\x9e"}, // S-cedilla
    {'\xab', "\xc2\xab"}, // left guillemets
    {'\xac', "\xc2\xac"}, // negation
    {'\xad', "\xc2\xad"}, // soft hyphen
    {'\xae', "\xc2\xae"}, // registered trademark sign
    {'\xaf', "\xc5\xbb"}, // Z with dot above

    {'\xb0', "\xc2\xb0"}, // degree sign
    {'\xb1', "\xc2\xb1"}, // plus-minus sign
    {'\xb2', "\xcb\x9b"}, // ogonek
    {'\xb3', "\xc5\x82"}, // l with stroke
    {'\xb4', "\xc2\xb4"}, // acute accent
    {'\xb5', "\xc2\xb5"}, // micro sign
    {'\xb6', "\xc2\xb6"}, // pilcrow
    {'\xb7', "\xc2\xb7"}, // middle dot
    {'\xb8', "\xc2\xb8"}, // cedilla
    {'\xb9', "\xc4\x85"}, // a with ogonek
    {'\xba', "\xc5\x9f"}, // s-cedilla
    {'\xbb', "\xc2\xbb"}, // right guillemets
    {'\xbc', "\xc4\xbd"}, // L with caron
    {'\xbd', "\xcb\x9d"}, // double acute accent
    {'\xbe', "\xc4\xbe"}, // l with caron
    {'\xbf', "\xc5\xbc"}, // z with dot above

    {'\xc0', "\xc5\x94"}, // R with acute
    {'\xc1', "\xc3\x81"}, // A with acute
    {'\xc2', "\xc3\x82"}, // A-circumflex
    {'\xc3', "\xc4\x82"}, // A-breve
    {'\xc4', "\xc3\x84"}, // A with diaeresis
    {'\xc5', "\xc4\xb9"}, // L with acute
    {'\xc6', "\xc4\x86"}, // C with acute
    {'\xc7', "\xc3\x87"}, // C-cedilla
    {'\xc8', "\xc4\x8c"}, // C with caron
    {'\xc9', "\xc3\x89"}, // E with acute
    {'\xca', "\xc4\x98"}, // E with ogonek
    {'\xcb', "\xc3\x8b"}, // E with diaeresis
    {'\xcc', "\xc4\x9a"}, // E with caron
    {'\xcd', "\xc3\x8d"}, // I with acute
    {'\xce', "\xc3\x8e"}, // I-circumflex
    {'\xcf', "\xc4\x8e"}, // D with caron

    {'\xd0', "\xc4\x90"}, // D with stroke
    {'\xd1', "\xc5\x83"}, // N with acute
    {'\xd2', "\xc5\x87"}, // N with caron
    {'\xd3', "\xc3\x93"}, // O with acute
    {'\xd4', "\xc3\x94"}, // O-circumflex
    {'\xd5', "\xc5\x90"}, // O with double acute
    {'\xd6', "\xc3\x96"}, // O with diaeresis
    {'\xd7', "\xc3\x97"}, // multiplication sign
    {'\xd8', "\xc5\x98"}, // R with caron
    {'\xd9', "\xc5\xae"}, // U with ring above
    {'\xda', "\xc3\x9a"}, // U with acute
    {'\xdb', "\xc5\xb0"}, // U with double acute
    {'\xdc', "\xc3\x9c"}, // U with diaeresis
    {'\xdd', "\xc3\x9d"}, // Y with acute
    {'\xde', "\xc5\xa2"}, // T-cedilla
    {'\xdf', "\xc3\x9f"}, // sharp s

    {'\xe0', "\xc5\x95"}, // r with acute
    {'\xe1', "\xc3\xa1"}, // a with acute
    {'\xe2', "\xc3\xa2"}, // a-circumflex
    {'\xe3', "\xc4\x83"}, // a-breve
    {'\xe4', "\xc3\xa4"}, // a with diaeresis
    {'\xe5', "\xc4\xba"}, // l with acute
    {'\xe6', "\xc4\x87"}, // c with acute
    {'\xe7', "\xc3\xa7"}, // c-cedilla
    {'\xe8', "\xc4\x8d"}, // c with caron
    {'\xe9', "\xc3\xa9"}, // e with acute
    {'\xea', "\xc4\x99"}, // e with ogonek
    {'\xeb', "\xc3\xab"}, // e with diaeresis
    {'\xec', "\xc4\x9b"}, // e with caron
    {'\xed', "\xc3\xad"}, // i with acute
    {'\xee', "\xc3\xae"}, // i-circumflex
    {'\xef', "\xc4\x8f"}, // d with caron

    {'\xf0', "\xc4\x91"}, // d with stroke
    {'\xf1', "\xc5\x84"}, // n with acute
    {'\xf2', "\xc5\x88"}, // n with caron
    {'\xf3', "\xc3\xb3"}, // o with acute
    {'\xf4', "\xc3\xb4"}, // o-circumflex
    {'\xf5', "\xc5\x91"}, // o with double acute
    {'\xf6', "\xc3\xb6"}, // o with diaeresis
    {'\xf7', "\xc3\xb7"}, // division sign
    {'\xf8', "\xc5\x99"}, // r with caron
    {'\xf9', "\xc5\xaf"}, // u with ring above
    {'\xfa', "\xc3\xba"}, // u with acute
    {'\xfb', "\xc5\xb1"}, // u with double acute
    {'\xfc', "\xc3\xbc"}, // u with diaeresis
    {'\xfd', "\xc3\xbd"}, // y with acute
    {'\xfe', "\xc5\xa3"}, // t-cedilla
    {'\xff', "\xcb\x99"}, // dot above
};
|
|
|
|
/// Lookup table: UTF-8 byte sequence -> Windows-1250 byte (0x80..0xFF).
///
/// Every key is the complete UTF-8 encoding of one character. Characters that
/// Windows-1250 cannot represent have no entry.
///
/// Declared `inline` so that every translation unit including this header
/// shares one table instead of building a private copy.
inline const std::unordered_map<std::string_view, unsigned char> UTF8_WINDOWS1250_MAP
{
    {"\xe2\x82\xac", '\x80'}, // euro sign
    {"\xe2\x80\x9a", '\x82'}, // single low quotation mark
    {"\xe2\x80\x9e", '\x84'}, // double low quotation mark
    {"\xe2\x80\xa6", '\x85'}, // ellipsis
    {"\xe2\x80\xa0", '\x86'}, // dagger
    {"\xe2\x80\xa1", '\x87'}, // double dagger
    {"\xe2\x80\xb0", '\x89'}, // per mille
    {"\xc5\xa0", '\x8a'}, // S with caron
    {"\xe2\x80\xb9", '\x8b'}, // single left guillemet
    {"\xc5\x9a", '\x8c'}, // S with acute
    {"\xc5\xa4", '\x8d'}, // T with caron
    {"\xc5\xbd", '\x8e'}, // Z with caron
    {"\xc5\xb9", '\x8f'}, // Z with acute

    {"\xe2\x80\x98", '\x91'}, // single upper quotation mark (opening)
    {"\xe2\x80\x99", '\x92'}, // single upper quotation mark (closing)
    {"\xe2\x80\x9c", '\x93'}, // double upper quotation mark (opening)
    {"\xe2\x80\x9d", '\x94'}, // double upper quotation mark (closing)
    {"\xe2\x80\xa2", '\x95'}, // bullet sign
    {"\xe2\x80\x93", '\x96'}, // en dash
    {"\xe2\x80\x94", '\x97'}, // em dash
    {"\xe2\x84\xa2", '\x99'}, // trademark sign
    {"\xc5\xa1", '\x9a'}, // s with caron
    {"\xe2\x80\xba", '\x9b'}, // single right guillemet
    {"\xc5\x9b", '\x9c'}, // s with acute
    {"\xc5\xa5", '\x9d'}, // t with caron
    {"\xc5\xbe", '\x9e'}, // z with caron
    {"\xc5\xba", '\x9f'}, // z with acute

    // Single-byte key kept as the mirror of WINDOWS1250_UTF8_MAP, which emits
    // NBSP as a plain space.
    {"\x20", '\xa0'}, // NBSP (as plain space)
    {"\xc2\xa0", '\xa0'}, // NBSP (real UTF-8 encoding of U+00A0)
    {"\xcb\x87", '\xa1'}, // caron
    {"\xcb\x98", '\xa2'}, // breve
    {"\xc5\x81", '\xa3'}, // L with stroke
    {"\xc2\xa4", '\xa4'}, // currency sign
    {"\xc4\x84", '\xa5'}, // A with ogonek
    {"\xc2\xa6", '\xa6'}, // broken vertical bar
    {"\xc2\xa7", '\xa7'}, // section sign
    {"\xc2\xa8", '\xa8'}, // diaeresis
    {"\xc2\xa9", '\xa9'}, // copyright sign
    {"\xc5\x9e", '\xaa'}, // S-cedilla
    {"\xc2\xab", '\xab'}, // left guillemets
    {"\xc2\xac", '\xac'}, // negation
    {"\xc2\xad", '\xad'}, // soft hyphen
    {"\xc2\xae", '\xae'}, // registered trademark sign
    {"\xc5\xbb", '\xaf'}, // Z with dot above

    {"\xc2\xb0", '\xb0'}, // degree sign
    {"\xc2\xb1", '\xb1'}, // plus-minus sign
    {"\xcb\x9b", '\xb2'}, // ogonek
    {"\xc5\x82", '\xb3'}, // l with stroke
    {"\xc2\xb4", '\xb4'}, // acute accent
    {"\xc2\xb5", '\xb5'}, // micro sign
    {"\xc2\xb6", '\xb6'}, // pilcrow
    {"\xc2\xb7", '\xb7'}, // middle dot
    {"\xc2\xb8", '\xb8'}, // cedilla
    {"\xc4\x85", '\xb9'}, // a with ogonek
    {"\xc5\x9f", '\xba'}, // s-cedilla
    {"\xc2\xbb", '\xbb'}, // right guillemets
    {"\xc4\xbd", '\xbc'}, // L with caron
    {"\xcb\x9d", '\xbd'}, // double acute accent
    {"\xc4\xbe", '\xbe'}, // l with caron
    {"\xc5\xbc", '\xbf'}, // z with dot above

    {"\xc5\x94", '\xc0'}, // R with acute
    {"\xc3\x81", '\xc1'}, // A with acute
    {"\xc3\x82", '\xc2'}, // A-circumflex
    {"\xc4\x82", '\xc3'}, // A-breve
    {"\xc3\x84", '\xc4'}, // A with diaeresis
    {"\xc4\xb9", '\xc5'}, // L with acute
    {"\xc4\x86", '\xc6'}, // C with acute
    {"\xc3\x87", '\xc7'}, // C-cedilla
    {"\xc4\x8c", '\xc8'}, // C with caron
    {"\xc3\x89", '\xc9'}, // E with acute
    {"\xc4\x98", '\xca'}, // E with ogonek
    {"\xc3\x8b", '\xcb'}, // E with diaeresis
    {"\xc4\x9a", '\xcc'}, // E with caron
    {"\xc3\x8d", '\xcd'}, // I with acute
    {"\xc3\x8e", '\xce'}, // I-circumflex
    {"\xc4\x8e", '\xcf'}, // D with caron

    {"\xc4\x90", '\xd0'}, // D with stroke
    {"\xc5\x83", '\xd1'}, // N with acute
    {"\xc5\x87", '\xd2'}, // N with caron
    {"\xc3\x93", '\xd3'}, // O with acute
    {"\xc3\x94", '\xd4'}, // O-circumflex
    {"\xc5\x90", '\xd5'}, // O with double acute
    {"\xc3\x96", '\xd6'}, // O with diaeresis
    {"\xc3\x97", '\xd7'}, // multiplication sign
    {"\xc5\x98", '\xd8'}, // R with caron
    {"\xc5\xae", '\xd9'}, // U with ring above
    {"\xc3\x9a", '\xda'}, // U with acute
    {"\xc5\xb0", '\xdb'}, // U with double acute
    {"\xc3\x9c", '\xdc'}, // U with diaeresis
    {"\xc3\x9d", '\xdd'}, // Y with acute
    {"\xc5\xa2", '\xde'}, // T-cedilla
    {"\xc3\x9f", '\xdf'}, // sharp s

    {"\xc5\x95", '\xe0'}, // r with acute
    {"\xc3\xa1", '\xe1'}, // a with acute
    {"\xc3\xa2", '\xe2'}, // a-circumflex
    {"\xc4\x83", '\xe3'}, // a-breve
    {"\xc3\xa4", '\xe4'}, // a with diaeresis
    {"\xc4\xba", '\xe5'}, // l with acute
    {"\xc4\x87", '\xe6'}, // c with acute
    {"\xc3\xa7", '\xe7'}, // c-cedilla
    {"\xc4\x8d", '\xe8'}, // c with caron
    {"\xc3\xa9", '\xe9'}, // e with acute
    {"\xc4\x99", '\xea'}, // e with ogonek
    {"\xc3\xab", '\xeb'}, // e with diaeresis
    {"\xc4\x9b", '\xec'}, // e with caron
    {"\xc3\xad", '\xed'}, // i with acute
    {"\xc3\xae", '\xee'}, // i-circumflex
    {"\xc4\x8f", '\xef'}, // d with caron

    {"\xc4\x91", '\xf0'}, // d with stroke
    {"\xc5\x84", '\xf1'}, // n with acute
    {"\xc5\x88", '\xf2'}, // n with caron
    {"\xc3\xb3", '\xf3'}, // o with acute
    {"\xc3\xb4", '\xf4'}, // o-circumflex
    {"\xc5\x91", '\xf5'}, // o with double acute
    {"\xc3\xb6", '\xf6'}, // o with diaeresis
    {"\xc3\xb7", '\xf7'}, // division sign
    {"\xc5\x99", '\xf8'}, // r with caron
    {"\xc5\xaf", '\xf9'}, // u with ring above
    {"\xc3\xba", '\xfa'}, // u with acute
    {"\xc5\xb1", '\xfb'}, // u with double acute
    {"\xc3\xbc", '\xfc'}, // u with diaeresis
    {"\xc3\xbd", '\xfd'}, // y with acute
    {"\xc5\xa3", '\xfe'}, // t-cedilla
    {"\xcb\x99", '\xff'}, // dot above
};
|
|
|
|
inline std::string win1250ToUTF8(const std::string& strRef)
|
|
{
|
|
std::string result;
|
|
result.reserve(strRef.size() * 2);
|
|
|
|
for (auto&& c : strRef)
|
|
{
|
|
if (WINDOWS1250_UTF8_MAP.contains(static_cast<unsigned char>(c)))
|
|
result += WINDOWS1250_UTF8_MAP.at(static_cast<unsigned char>(c));
|
|
else
|
|
result += c;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
inline std::string UTF8ToWin1250(const std::string& strRef)
|
|
{
|
|
std::string result;
|
|
result.reserve(strRef.size());
|
|
size_t charSize = 1;
|
|
|
|
for (size_t i = 0; i < strRef.length(); ++i)
|
|
{
|
|
if (const int c = static_cast<unsigned char>(strRef[i]); c >= 128)
|
|
{
|
|
if (c < 224)
|
|
charSize = 2;
|
|
else if (c < 240)
|
|
charSize = 3;
|
|
else if (c < 248)
|
|
charSize = 4;
|
|
else if (c == 252)
|
|
charSize = 5;
|
|
else
|
|
charSize = 6;
|
|
}
|
|
// Update loop index according to UTF8 charSize;
|
|
i += charSize - 1;
|
|
|
|
if (charSize == 1)
|
|
{
|
|
result += strRef[i];
|
|
continue;
|
|
}
|
|
|
|
if (i + charSize > strRef.length())
|
|
{
|
|
result += '?';
|
|
return result;
|
|
}
|
|
|
|
String utfChar;
|
|
utfChar.reserve(charSize);
|
|
|
|
for (size_t j = 0; j < charSize; ++j)
|
|
{
|
|
utfChar += strRef[i + j];
|
|
}
|
|
|
|
if (UTF8_WINDOWS1250_MAP.contains(utfChar))
|
|
{
|
|
result += static_cast<char>(UTF8_WINDOWS1250_MAP.at(utfChar));
|
|
}
|
|
else
|
|
{
|
|
result += '?';
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}
|
|
}
|
|
#endif // NONUT_CORE_STRING_HELPERS_H
|