refactor: Refactorized whole project structure
This commit is contained in:
347
source/NoNut/include/StringHelpers.h
Normal file
347
source/NoNut/include/StringHelpers.h
Normal file
@@ -0,0 +1,347 @@
|
||||
#ifndef NONUT_CORE_STRING_HELPERS_H
|
||||
#define NONUT_CORE_STRING_HELPERS_H
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
|
||||
namespace nonut
|
||||
{
|
||||
const std::unordered_map<unsigned char, std::string_view> WINDOWS1250_UTF8_MAP
|
||||
{
|
||||
{'\x80', "\xe2\x82\xac"}, // euro sign
|
||||
{'\x82', "\xe2\x80\x9a"}, // lower quotation mark
|
||||
{'\x84', "\xe2\x80\x9e"}, // lower quotation marks
|
||||
{'\x85', "\xe2\x80\xa6"}, // ellipsis
|
||||
{'\x86', "\xe2\x80\xa0"}, // dagger
|
||||
{'\x87', "\xe2\x80\xa1"}, // double dagger
|
||||
{'\x89', "\xe2\x80\xb0"}, // per mille
|
||||
{'\x8a', "\xc5\xa0"}, // S with caron
|
||||
{'\x8b', "\xe2\x80\xb9"}, // left guillemet
|
||||
{'\x8c', "\xc5\x9a"}, // S with acute
|
||||
{'\x8d', "\xc5\xa4"}, // T with caron
|
||||
{'\x8e', "\xc5\xbd"}, // Z with caron
|
||||
{'\x8f', "\xc5\xb9"}, // Z with acute
|
||||
|
||||
{'\x91', "\xe2\x80\x98"}, // upper quotation mark (opening)
|
||||
{'\x92', "\xe2\x80\x99"}, // upper quotation mark (closing)
|
||||
{'\x93', "\xe2\x80\x9c"}, // upper quotation marks (opening)
|
||||
{'\x94', "\xe2\x80\x9d"}, // upper quotation marks (closing)
|
||||
{'\x95', "\xe2\x80\xa2"}, // bullet sign
|
||||
{'\x96', "\xe2\x80\x93"}, // en dash
|
||||
{'\x97', "\xe2\x80\x94"}, // em dash
|
||||
{'\x99', "\xe2\x84\xa2"}, // trademark sign
|
||||
{'\x9a', "\xc5\xa1"}, // s with caron
|
||||
{'\x9b', "\xe2\x80\xba"}, // right guillemet
|
||||
{'\x9c', "\xc5\x9b"}, // s with acute
|
||||
{'\x9d', "\xc5\xa5"}, // t with caron
|
||||
{'\x9e', "\xc5\xbe"}, // z with caron
|
||||
{'\x9f', "\xc5\xba"}, // z with acute
|
||||
|
||||
{'\xa0', "\x20"}, // NBSP
|
||||
{'\xa1', "\xcb\x87"}, // caron
|
||||
{'\xa2', "\xcb\x98"}, // breve
|
||||
{'\xa3', "\xc5\x81"}, // L with stroke
|
||||
{'\xa4', "\xc2\xa4"}, // currency sign
|
||||
{'\xa5', "\xc4\x84"}, // A with ogonek
|
||||
{'\xa6', "\xc2\xa6"}, // vertical bar
|
||||
{'\xa7', "\xc2\xa7"}, // section sign
|
||||
{'\xa8', "\xc2\xa8"}, // diaeresis
|
||||
{'\xa9', "\xc2\xa9"}, // copyright sign
|
||||
{'\xaa', "\xc5\x9e"}, // S-cedilla
|
||||
{'\xab', "\xc2\xab"}, // left guillemets
|
||||
{'\xac', "\xc2\xac"}, // negation
|
||||
{'\xad', "\xc2\xad"}, // soft hyphen
|
||||
{'\xae', "\xc2\xae"}, // registered trademark sign
|
||||
{'\xaf', "\xc5\xbb"}, // Z with dot above
|
||||
|
||||
{'\xb0', "\xc2\xb0"}, // degree sign
|
||||
{'\xb1', "\xc2\xb1"}, // plus-minus sign
|
||||
{'\xb2', "\xcb\x9b"}, // ogonek
|
||||
{'\xb3', "\xc5\x82"}, // l with stroke
|
||||
{'\xb4', "\xc2\xb4"}, // acute accent
|
||||
{'\xb5', "\xc2\xb5"}, // Mu letter
|
||||
{'\xb6', "\xc2\xb6"}, // pilcrow
|
||||
{'\xb7', "\xc2\xb7"}, // middle dot
|
||||
{'\xb8', "\xc2\xb8"}, // cedilla
|
||||
{'\xb9', "\xc4\x85"}, // a with ogonek
|
||||
{'\xba', "\xc5\x9f"}, // s-cedilla
|
||||
{'\xbb', "\xc2\xbb"}, // right guillemets
|
||||
{'\xbc', "\xc4\xbd"}, // Lj-
|
||||
{'\xbd', "\xcb\x9d"}, // double acute accent
|
||||
{'\xbe', "\xc4\xbe"}, // lj-
|
||||
{'\xbf', "\xc5\xbc"}, // z with dot above
|
||||
|
||||
{'\xc0', "\xc5\x94"}, // R with acute
|
||||
{'\xc1', "\xc3\x81"}, // A with acute
|
||||
{'\xc2', "\xc3\x82"}, // A-circumflex
|
||||
{'\xc3', "\xc4\x82"}, // A-breve
|
||||
{'\xc4', "\xc3\x84"}, // A with diaeresis
|
||||
{'\xc5', "\xc4\xb9"}, // L with acute
|
||||
{'\xc6', "\xc4\x86"}, // C with acute
|
||||
{'\xc7', "\xc3\x87"}, // C-cedilla
|
||||
{'\xc8', "\xc4\x8c"}, // C with caron
|
||||
{'\xc9', "\xc3\x89"}, // C with acute
|
||||
{'\xca', "\xc4\x98"}, // E with ogonek
|
||||
{'\xcb', "\xc3\x8b"}, // E with diaeresis
|
||||
{'\xcc', "\xc3\x8b"}, // E with caron
|
||||
{'\xcd', "\xc3\x8d"}, // I with acute
|
||||
{'\xce', "\xc3\x8e"}, // I-circumflex
|
||||
{'\xcf', "\xc4\x8e"}, // D with caron
|
||||
|
||||
{'\xd0', "\xc4\x90"}, // crossed D
|
||||
{'\xd1', "\xc5\x83"}, // N with acute
|
||||
{'\xd2', "\xc5\x87"}, // N with caron
|
||||
{'\xd3', "\xc3\x93"}, // O with acute
|
||||
{'\xd4', "\xc3\x94"}, // O-circumflex
|
||||
{'\xd5', "\xc5\x90"}, // O with dobule accute
|
||||
{'\xd6', "\xc3\x96"}, // O with diaeresis
|
||||
{'\xd7', "\xc3\x97"}, // multiplication sign
|
||||
{'\xd8', "\xc5\x98"}, // R with caron
|
||||
{'\xd9', "\xc5\xae"}, // U with diacritic
|
||||
{'\xda', "\xc3\x9a"}, // U with acute
|
||||
{'\xdb', "\xc5\xb0"}, // U with double accent
|
||||
{'\xdc', "\xc3\x9c"}, // U with diaeresis
|
||||
{'\xdd', "\xc3\x9d"}, // Y with acute
|
||||
{'\xdf', "\xc5\xa2"}, // T-cedilla
|
||||
|
||||
{'\xe0', "\xc5\x95"}, // r with acute
|
||||
{'\xe1', "\xc3\xa1"}, // a with acute
|
||||
{'\xe2', "\xc3\xa2"}, // a-circumflex
|
||||
{'\xe3', "\xc4\x83"}, // a-breve
|
||||
{'\xe4', "\xc3\xa4"}, // a with diaeresis
|
||||
{'\xe5', "\xc4\xba"}, // l with acute
|
||||
{'\xe6', "\xc4\x87"}, // c with acute
|
||||
{'\xe7', "\xc3\xa7"}, // c-cedilla
|
||||
{'\xe8', "\xc4\x8d"}, // c with caron
|
||||
{'\xe9', "\xc3\xa9"}, // c with acute
|
||||
{'\xea', "\xc4\x99"}, // e with ogonek
|
||||
{'\xeb', "\xc3\xab"}, // e with diaeresis
|
||||
{'\xec', "\xc4\x9b"}, // e with caron
|
||||
{'\xed', "\xc3\xad"}, // i with acute
|
||||
{'\xee', "\xc3\xae"}, // i-circumflex
|
||||
{'\xef', "\xc4\x8f"}, // d with caron
|
||||
|
||||
{'\xf0', "\xc4\x91"}, // crossed d
|
||||
{'\xf1', "\xc5\x84"}, // n with acute
|
||||
{'\xf2', "\xc5\x88"}, // n with caron
|
||||
{'\xf3', "\xc3\xb3"}, // o with acute
|
||||
{'\xf4', "\xc3\xb4"}, // o-circumflex
|
||||
{'\xf5', "\xc5\x91"}, // o with double accent
|
||||
{'\xf6', "\xc3\xb6"}, // o with diaeresis
|
||||
{'\xf7', "\xc3\xb7"}, // division sign
|
||||
{'\xf8', "\xc5\x99"}, // r with caron
|
||||
{'\xf9', "\xc5\xaf"}, // u with diacritic
|
||||
{'\xfa', "\xc3\xba"}, // u with acute
|
||||
{'\xfb', "\xc5\xb1"}, // u with double accent
|
||||
{'\xfc', "\xc3\xbc"}, // u with diaeresis
|
||||
{'\xfd', "\xc3\xbd"}, // y with acute
|
||||
{'\xfe', "\xc5\xa3"}, // t-cedilla
|
||||
{'\xff', "\xcb\x99"}, // diactric dot
|
||||
};
|
||||
|
||||
const std::unordered_map<std::string_view, unsigned char> UTF8_WINDOWS1250_MAP
|
||||
{
|
||||
{"\xe2\x82\xac", '\x80'}, // euro sign
|
||||
{"\xe2\x80\x9a", '\x82'}, // lower quotation mark
|
||||
{"\xe2\x80\x9e", '\x84'}, // lower quotation marks
|
||||
{"\xe2\x80\xa6", '\x85'}, // ellipsis
|
||||
{"\xe2\x80\xa0", '\x86'}, // dagger
|
||||
{"\xe2\x80\xa1", '\x87'}, // double dagger
|
||||
{"\xe2\x80\xb0", '\x89'}, // per mille
|
||||
{"\xc5\xa0", '\x8a'}, // S with caron
|
||||
{"\xe2\x80\xb9", '\x8b'}, // left guillemet
|
||||
{"\xc5\x9a", '\x8c'}, // S with acute
|
||||
{"\xc5\xa4", '\x8d'}, // T with caron
|
||||
{"\xc5\xbd", '\x8e'}, // Z with caron
|
||||
{"\xc5\xb9", '\x8f'}, // Z with acute
|
||||
|
||||
{"\xe2\x80\x98", '\x91'}, // upper quotation mark (opening)
|
||||
{"\xe2\x80\x99", '\x92'}, // upper quotation mark (closing)
|
||||
{"\xe2\x80\x9c", '\x93'}, // upper quotation marks (opening)
|
||||
{"\xe2\x80\x9d", '\x94'}, // upper quotation marks (closing)
|
||||
{"\xe2\x80\xa2", '\x95'}, // bullet sign
|
||||
{"\xe2\x80\x93", '\x96'}, // en dash
|
||||
{"\xe2\x80\x94", '\x97'}, // em dash
|
||||
{"\xe2\x84\xa2", '\x99'}, // trademark sign
|
||||
{"\xc5\xa1", '\x9a'}, // s with caron
|
||||
{"\xe2\x80\xba", '\x9b'}, // right guillemet
|
||||
{"\xc5\x9b", '\x9c'}, // s with acute
|
||||
{"\xc5\xa5", '\x9d'}, // t with caron
|
||||
{"\xc5\xbe", '\x9e'}, // z with caron
|
||||
{"\xc5\xba", '\x9f'}, // z with acute
|
||||
|
||||
{"\x20", '\xa0'}, // NBSP
|
||||
{"\xcb\x87", '\xa1'}, // caron
|
||||
{"\xcb\x98", '\xa2'}, // breve
|
||||
{"\xc5\x81", '\xa3'}, // L with stroke
|
||||
{"\xc2\xa4", '\xa4'}, // currency sign
|
||||
{"\xc4\x84", '\xa5'}, // A with ogonek
|
||||
{"\xc2\xa6", '\xa6'}, // vertical bar
|
||||
{"\xc2\xa7", '\xa7'}, // section sign
|
||||
{"\xc2\xa8", '\xa8'}, // diaeresis
|
||||
{"\xc2\xa9", '\xa9'}, // copyright sign
|
||||
{"\xc5\x9e", '\xaa'}, // S-cedilla
|
||||
{"\xc2\xab", '\xab'}, // left guillemets
|
||||
{"\xc2\xac", '\xac'}, // negation
|
||||
{"\xc2\xad", '\xad'}, // soft hyphen
|
||||
{"\xc2\xae", '\xae'}, // registered trademark sign
|
||||
{"\xc5\xbb", '\xaf'}, // Z with dot above
|
||||
|
||||
{"\xc2\xb0", '\xb0'}, // degree sign
|
||||
{"\xc2\xb1", '\xb1'}, // plus-minus sign
|
||||
{"\xcb\x9b", '\xb2'}, // ogonek
|
||||
{"\xc5\x82", '\xb3'}, // l with stroke
|
||||
{"\xc2\xb4", '\xb4'}, // acute accent
|
||||
{"\xc2\xb5", '\xb5'}, // Mu letter
|
||||
{"\xc2\xb6", '\xb6'}, // pilcrow
|
||||
{"\xc2\xb7", '\xb7'}, // middle dot
|
||||
{"\xc2\xb8", '\xb8'}, // cedilla
|
||||
{"\xc4\x85", '\xb9'}, // a with ogonek
|
||||
{"\xc5\x9f", '\xba'}, // s-cedilla
|
||||
{"\xc2\xbb", '\xbb'}, // right guillemets
|
||||
{"\xc4\xbd", '\xbc'}, // Lj-
|
||||
{"\xcb\x9d", '\xbd'}, // double acute accent
|
||||
{"\xc4\xbe", '\xbe'}, // lj-
|
||||
{"\xc5\xbc", '\xbf'}, // z with dot above
|
||||
|
||||
{"\xc5\x94", '\xc0'}, // R with acute
|
||||
{"\xc3\x81", '\xc1'}, // A with acute
|
||||
{"\xc3\x82", '\xc2'}, // A-circumflex
|
||||
{"\xc4\x82", '\xc3'}, // A-breve
|
||||
{"\xc3\x84", '\xc4'}, // A with diaeresis
|
||||
{"\xc4\xb9", '\xc5'}, // L with acute
|
||||
{"\xc4\x86", '\xc6'}, // C with acute
|
||||
{"\xc3\x87", '\xc7'}, // C-cedilla
|
||||
{"\xc4\x8c", '\xc8'}, // C with caron
|
||||
{"\xc3\x89", '\xc9'}, // C with acute
|
||||
{"\xc4\x98", '\xca'}, // E with ogonek
|
||||
{"\xc3\x8b", '\xcb'}, // E with diaeresis
|
||||
{"\xc3\x8b", '\xcc'}, // E with caron
|
||||
{"\xc3\x8d", '\xcd'}, // I with acute
|
||||
{"\xc3\x8e", '\xce'}, // I-circumflex
|
||||
{"\xc4\x8e", '\xcf'}, // D with caron
|
||||
|
||||
{"\xc4\x90", '\xd0'}, // crossed D
|
||||
{"\xc5\x83", '\xd1'}, // N with acute
|
||||
{"\xc5\x87", '\xd2'}, // N with caron
|
||||
{"\xc3\x93", '\xd3'}, // O with acute
|
||||
{"\xc3\x94", '\xd4'}, // O-circumflex
|
||||
{"\xc5\x90", '\xd5'}, // O with dobule accute
|
||||
{"\xc3\x96", '\xd6'}, // O with diaeresis
|
||||
{"\xc3\x97", '\xd7'}, // multiplication sign
|
||||
{"\xc5\x98", '\xd8'}, // R with caron
|
||||
{"\xc5\xae", '\xd9'}, // U with diacritic
|
||||
{"\xc3\x9a", '\xda'}, // U with acute
|
||||
{"\xc5\xb0", '\xdb'}, // U with double accent
|
||||
{"\xc3\x9c", '\xdc'}, // U with diaeresis
|
||||
{"\xc3\x9d", '\xdd'}, // Y with acute
|
||||
{"\xc5\xa2", '\xdf'}, // T-cedilla
|
||||
|
||||
{"\xc5\x95", '\xe0'}, // r with acute
|
||||
{"\xc3\xa1", '\xe1'}, // a with acute
|
||||
{"\xc3\xa2", '\xe2'}, // a-circumflex
|
||||
{"\xc4\x83", '\xe3'}, // a-breve
|
||||
{"\xc3\xa4", '\xe4'}, // a with diaeresis
|
||||
{"\xc4\xba", '\xe5'}, // l with acute
|
||||
{"\xc4\x87", '\xe6'}, // c with acute
|
||||
{"\xc3\xa7", '\xe7'}, // c-cedilla
|
||||
{"\xc4\x8d", '\xe8'}, // c with caron
|
||||
{"\xc3\xa9", '\xe9'}, // c with acute
|
||||
{"\xc4\x99", '\xea'}, // e with ogonek
|
||||
{"\xc3\xab", '\xeb'}, // e with diaeresis
|
||||
{"\xc4\x9b", '\xec'}, // e with caron
|
||||
{"\xc3\xad", '\xed'}, // i with acute
|
||||
{"\xc3\xae", '\xee'}, // i-circumflex
|
||||
{"\xc4\x8f", '\xef'}, // d with caron
|
||||
|
||||
{"\xc4\x91", '\xf0'}, // crossed d
|
||||
{"\xc5\x84", '\xf1'}, // n with acute
|
||||
{"\xc5\x88", '\xf2'}, // n with caron
|
||||
{"\xc3\xb3", '\xf3'}, // o with acute
|
||||
{"\xc3\xb4", '\xf4'}, // o-circumflex
|
||||
{"\xc5\x91", '\xf5'}, // o with double accent
|
||||
{"\xc3\xb6", '\xf6'}, // o with diaeresis
|
||||
{"\xc3\xb7", '\xf7'}, // division sign
|
||||
{"\xc5\x99", '\xf8'}, // r with caron
|
||||
{"\xc5\xaf", '\xf9'}, // u with diacritic
|
||||
{"\xc3\xba", '\xfa'}, // u with acute
|
||||
{"\xc5\xb1", '\xfb'}, // u with double accent
|
||||
{"\xc3\xbc", '\xfc'}, // u with diaeresis
|
||||
{"\xc3\xbd", '\xfd'}, // y with acute
|
||||
{"\xc5\xa3", '\xfe'}, // t-cedilla
|
||||
{"\xcb\x99", '\xff'}, // diactric dot
|
||||
};
|
||||
|
||||
inline std::string win1250ToUTF8(const std::string& strRef)
|
||||
{
|
||||
std::string result;
|
||||
result.reserve(strRef.size() * 2);
|
||||
|
||||
for (auto&& c : strRef)
|
||||
{
|
||||
if (WINDOWS1250_UTF8_MAP.contains(static_cast<unsigned char>(c)))
|
||||
result += WINDOWS1250_UTF8_MAP.at(static_cast<unsigned char>(c));
|
||||
else
|
||||
result += c;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
inline std::string UTF8ToWin1250(const std::string& strRef)
|
||||
{
|
||||
std::string result;
|
||||
result.reserve(strRef.size());
|
||||
size_t charSize = 1;
|
||||
|
||||
for (size_t i = 0; i < strRef.length(); ++i)
|
||||
{
|
||||
if (const int c = static_cast<unsigned char>(strRef[i]); c >= 128)
|
||||
{
|
||||
if (c < 224)
|
||||
charSize = 2;
|
||||
else if (c < 240)
|
||||
charSize = 3;
|
||||
else if (c < 248)
|
||||
charSize = 4;
|
||||
else if (c == 252)
|
||||
charSize = 5;
|
||||
else
|
||||
charSize = 6;
|
||||
}
|
||||
// Update loop index according to UTF8 charSize;
|
||||
i += charSize - 1;
|
||||
|
||||
if (charSize == 1)
|
||||
{
|
||||
result += strRef[i];
|
||||
continue;
|
||||
}
|
||||
|
||||
if (i + charSize > strRef.length())
|
||||
{
|
||||
result += '?';
|
||||
return result;
|
||||
}
|
||||
|
||||
String utfChar;
|
||||
utfChar.reserve(charSize);
|
||||
|
||||
for (size_t j = 0; j < charSize; ++j)
|
||||
{
|
||||
utfChar += strRef[i + j];
|
||||
}
|
||||
|
||||
if (UTF8_WINDOWS1250_MAP.contains(utfChar))
|
||||
{
|
||||
result += static_cast<char>(UTF8_WINDOWS1250_MAP.at(utfChar));
|
||||
}
|
||||
else
|
||||
{
|
||||
result += '?';
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
||||
#endif // NONUT_CORE_STRING_HELPERS_H
|
||||
Reference in New Issue
Block a user