Complex Unicode Cases in C#
David Shergilashvili
Engineering Manager | .NET Solution Architect | Software Developer | Herding Cats and Microservices
1. Handling Bidirectional Text
One of the most challenging scenarios in internationalization is managing bidirectional text (LTR and RTL).
/// <summary>
/// Inserts Unicode directional marks at the points where text switches
/// between left-to-right and right-to-left scripts.
/// </summary>
public class BidiTextManager
{
    private const char Lrm = '\u200E'; // Left-to-Right Mark
    private const char Rlm = '\u200F'; // Right-to-Left Mark

    /// <summary>
    /// Returns <paramref name="input"/> with a Right-to-Left Mark inserted before the
    /// first letter of every RTL run and a Left-to-Right Mark before the first letter
    /// of every LTR run that follows an RTL run.
    /// </summary>
    /// <param name="input">Text to annotate. Must not be null.</param>
    /// <returns>The annotated text; an empty input yields an empty string.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="input"/> is null.</exception>
    public static string NormalizeBidiText(string input)
    {
        if (input == null)
        {
            throw new System.ArgumentNullException(nameof(input));
        }

        var result = new StringBuilder(input.Length + 8);
        bool inRtlRun = false; // text is assumed to start in an LTR context

        int i = 0;
        while (i < input.Length)
        {
            // Walk by code point so supplementary-plane letters are classified as one unit.
            int length = char.IsSurrogatePair(input, i) ? 2 : 1;

            // Only letters are treated as direction-setting. Spaces, digits, punctuation
            // and combining marks keep the direction of the current run, so a space
            // between two RTL words does not produce a spurious LRM/RLM pair.
            if (char.IsLetter(input, i))
            {
                int codePoint = length == 2 ? char.ConvertToUtf32(input, i) : input[i];
                bool isRtl = IsRtlCodePoint(codePoint);
                if (isRtl != inRtlRun)
                {
                    result.Append(isRtl ? Rlm : Lrm);
                    inRtlRun = isRtl;
                }
            }

            result.Append(input, i, length);
            i += length;
        }

        return result.ToString();
    }

    /// <summary>
    /// Reports whether a code point lies in a block whose default bidirectional class
    /// is right-to-left. CharUnicodeInfo has no public bidi-class lookup, so the
    /// block ranges are checked directly.
    /// </summary>
    private static bool IsRtlCodePoint(int codePoint)
    {
        return (codePoint >= 0x0590 && codePoint <= 0x08FF)      // Hebrew, Arabic, Syriac, Thaana, NKo, ...
            || (codePoint >= 0xFB1D && codePoint <= 0xFDFF)      // Hebrew and Arabic presentation forms A
            || (codePoint >= 0xFE70 && codePoint <= 0xFEFC)      // Arabic presentation forms B
            || (codePoint >= 0x10800 && codePoint <= 0x10FFF)    // historic RTL scripts
            || (codePoint >= 0x1E800 && codePoint <= 0x1EFFF);   // Mende Kikakui, Adlam, Arabic math, ...
    }
}
2. Text Normalization for Multiple Sources
When text comes from different sources (mobile, web, desktop), we often encounter different normalization forms:
/// <summary>
/// Brings text from heterogeneous sources to one canonical shape: Unicode Form C,
/// plain ASCII spaces, and ASCII quotes/ellipsis.
/// </summary>
public class TextNormalizer
{
    /// <summary>Outcome of a <see cref="NormalizeText"/> call.</summary>
    public class NormalizationResult
    {
        /// <summary>The text after all normalization steps.</summary>
        public string NormalizedText { get; set; }

        /// <summary>True when the normalized text differs from the input.</summary>
        public bool HasChanges { get; set; }

        /// <summary>
        /// One entry per step that altered the text. The key is the step name
        /// ("UnicodeNormalization", "Spaces", "Punctuation") and the value is the
        /// text as it stood immediately after that step.
        /// </summary>
        public Dictionary<string, string> Modifications { get; set; }
    }

    // Space variants that are folded to U+0020.
    private static readonly char[] SpaceVariants =
    {
        '\u00A0', // NO-BREAK SPACE
        '\u2002', // EN SPACE
        '\u2003', // EM SPACE
        '\u2009'  // THIN SPACE
    };

    // Typographic punctuation and its ASCII replacement.
    private static readonly Dictionary<string, string> PunctuationReplacements =
        new Dictionary<string, string>
        {
            {"\u2018", "'"},   // LEFT SINGLE QUOTATION
            {"\u2019", "'"},   // RIGHT SINGLE QUOTATION
            {"\u201C", "\""},  // LEFT DOUBLE QUOTATION
            {"\u201D", "\""},  // RIGHT DOUBLE QUOTATION
            {"\u2026", "..."}  // HORIZONTAL ELLIPSIS
        };

    /// <summary>
    /// Applies Form C normalization, then space folding, then punctuation folding.
    /// </summary>
    /// <param name="input">Text to normalize. Must not be null.</param>
    /// <returns>The normalized text together with a record of which steps changed it.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="input"/> is null.</exception>
    /// <exception cref="System.ArgumentException">
    /// Thrown by <see cref="string.Normalize(NormalizationForm)"/> when the input contains invalid Unicode.
    /// </exception>
    public static NormalizationResult NormalizeText(string input)
    {
        if (input == null)
        {
            throw new System.ArgumentNullException(nameof(input));
        }

        var modifications = new Dictionary<string, string>();

        string composed = input.Normalize(NormalizationForm.FormC);
        RecordIfChanged(modifications, "UnicodeNormalization", input, composed);

        string spaced = NormalizeSpaces(composed);
        RecordIfChanged(modifications, "Spaces", composed, spaced);

        string punctuated = NormalizePunctuation(spaced);
        RecordIfChanged(modifications, "Punctuation", spaced, punctuated);

        return new NormalizationResult
        {
            NormalizedText = punctuated,
            HasChanges = input != punctuated,
            Modifications = modifications
        };
    }

    /// <summary>Adds a step entry when the step produced different text.</summary>
    private static void RecordIfChanged(
        Dictionary<string, string> modifications, string step, string before, string after)
    {
        if (before != after)
        {
            modifications[step] = after;
        }
    }

    /// <summary>Replaces each listed space variant with an ASCII space.</summary>
    private static string NormalizeSpaces(string input)
    {
        return SpaceVariants.Aggregate(input, (current, space) =>
            current.Replace(space, ' '));
    }

    /// <summary>Replaces curly quotes and the ellipsis character with ASCII equivalents.</summary>
    private static string NormalizePunctuation(string input)
    {
        // The keys are distinct single characters that never appear in any
        // replacement value, so the enumeration order does not affect the result.
        return PunctuationReplacements.Aggregate(input, (current, replacement) =>
            current.Replace(replacement.Key, replacement.Value));
    }
}
3. Unicode Security Concerns
Unicode presents security challenges, such as homoglyph attacks:
/// <summary>
/// Helpers for spotting look-alike characters and for stripping characters
/// that should not survive in untrusted text.
/// </summary>
public class SecurityValidator
{
    // Latin letter -> characters that look like it. Written as escapes so the table
    // survives re-encoding of the source file, and so a key can never list itself.
    // NOTE(review): the third entry for 'a' and 'o' was unreadable in the original
    // (it appeared as Latin 'a' and '?'); fullwidth forms are assumed - confirm.
    private static readonly Dictionary<char, char[]> Homoglyphs = new Dictionary<char, char[]>
    {
        {'a', new[] {'\u0430', '\u03B1', '\uFF41'}}, // Cyrillic a, Greek alpha, fullwidth a
        {'e', new[] {'\u0435', '\u044D', '\u03B5'}}, // Cyrillic ie, Cyrillic e, Greek epsilon
        {'o', new[] {'\u043E', '\u03BF', '\uFF4F'}}  // Cyrillic o, Greek omicron, fullwidth o
    };

    // Category -> text substituted for every code point in that category.
    private static readonly Dictionary<UnicodeCategory, string> ReplacementRules =
        new Dictionary<UnicodeCategory, string>
        {
            {UnicodeCategory.Control, string.Empty},
            {UnicodeCategory.Format, string.Empty},
            {UnicodeCategory.Surrogate, "?"},
            {UnicodeCategory.PrivateUse, "?"},
            {UnicodeCategory.OtherNotAssigned, "?"}
        };

    /// <summary>
    /// Reports whether <paramref name="input"/> contains a Latin 'a', 'e' or 'o'
    /// together with at least one of that letter's listed look-alikes.
    /// </summary>
    /// <param name="input">Text to inspect. Must not be null.</param>
    /// <returns>True when a letter and one of its look-alikes both occur.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="input"/> is null.</exception>
    /// <remarks>
    /// Text that replaces every occurrence of a Latin letter with a look-alike is
    /// not flagged, because the Latin letter itself is then absent.
    /// </remarks>
    public static bool IsHomoglyphAttackAttempt(string input)
    {
        if (input == null)
        {
            throw new System.ArgumentNullException(nameof(input));
        }

        foreach (var kvp in Homoglyphs)
        {
            if (input.IndexOf(kvp.Key) >= 0 && kvp.Value.Any(lookalike => input.IndexOf(lookalike) >= 0))
            {
                return true;
            }
        }

        return false;
    }

    /// <summary>
    /// Normalizes <paramref name="input"/> to Form KC and then removes control and
    /// format characters and replaces private-use and unassigned code points, as
    /// well as lone surrogates, with "?".
    /// </summary>
    /// <param name="input">Untrusted text. Must not be null.</param>
    /// <returns>The sanitized text.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="input"/> is null.</exception>
    public static string SanitizeInput(string input)
    {
        if (input == null)
        {
            throw new System.ArgumentNullException(nameof(input));
        }

        // string.Normalize rejects ill-formed UTF-16, so lone surrogates are
        // neutralized before normalization rather than after it.
        string normalized = ReplaceLoneSurrogates(input).Normalize(NormalizationForm.FormKC);

        var result = new StringBuilder(normalized.Length);
        int i = 0;
        while (i < normalized.Length)
        {
            // Classify whole code points: a valid surrogate pair is one character
            // (for example an emoji), not two "Surrogate" code units.
            int length = char.IsSurrogatePair(normalized, i) ? 2 : 1;
            var category = CharUnicodeInfo.GetUnicodeCategory(normalized, i);

            if (ReplacementRules.TryGetValue(category, out var replacement))
            {
                result.Append(replacement);
            }
            else
            {
                result.Append(normalized, i, length);
            }

            i += length;
        }

        return result.ToString();
    }

    /// <summary>Replaces every unpaired surrogate code unit with '?'.</summary>
    private static string ReplaceLoneSurrogates(string text)
    {
        var repaired = new StringBuilder(text.Length);
        int i = 0;
        while (i < text.Length)
        {
            if (char.IsSurrogatePair(text, i))
            {
                repaired.Append(text, i, 2);
                i += 2;
            }
            else
            {
                repaired.Append(char.IsSurrogate(text[i]) ? '?' : text[i]);
                i++;
            }
        }

        return repaired.ToString();
    }
}
4. Multilingual Search Optimization
Implementing efficient search in multilingual text:
/// <summary>
/// In-memory word index that matches queries regardless of case, compatibility
/// form and diacritics, using the lower-casing rules of each configured culture.
/// </summary>
public class MultilingualSearchEngine
{
    /// <summary>Maps each normalized word form to the places it was seen.</summary>
    private class SearchIndex
    {
        // Each posting keeps the original word next to its document id, so a hit
        // can report the word that actually matched.
        public Dictionary<string, HashSet<(int DocumentId, string Word)>> NormalizedWordLocations { get; }

        public SearchIndex()
        {
            NormalizedWordLocations = new Dictionary<string, HashSet<(int DocumentId, string Word)>>();
        }
    }

    private static readonly char[] WordSeparators = {' ', '\t', '\n', '\r'};

    private readonly SearchIndex _index = new SearchIndex();
    private readonly CultureInfo[] _supportedCultures;

    /// <summary>Creates an engine for the given cultures.</summary>
    /// <param name="cultures">
    /// Cultures whose lower-casing rules are applied. When none are supplied the
    /// invariant culture is used, so the engine still indexes and matches.
    /// </param>
    public MultilingualSearchEngine(params CultureInfo[] cultures)
    {
        _supportedCultures = cultures != null && cultures.Length > 0
            ? (CultureInfo[])cultures.Clone()
            : new[] {CultureInfo.InvariantCulture};
    }

    /// <summary>Splits <paramref name="text"/> on whitespace and indexes every word.</summary>
    /// <param name="text">Document text. Must not be null.</param>
    /// <param name="documentId">Identifier reported back in search results.</param>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public void IndexText(string text, int documentId)
    {
        if (text == null)
        {
            throw new ArgumentNullException(nameof(text));
        }

        var words = text.Split(WordSeparators, StringSplitOptions.RemoveEmptyEntries);
        foreach (var word in words)
        {
            foreach (var normalized in NormalizeWordForAllCultures(word))
            {
                if (!_index.NormalizedWordLocations.TryGetValue(normalized, out var postings))
                {
                    postings = new HashSet<(int DocumentId, string Word)>();
                    _index.NormalizedWordLocations[normalized] = postings;
                }

                postings.Add((documentId, word));
            }
        }
    }

    /// <summary>
    /// Produces the lookup keys for a word: for every culture, the lower-cased
    /// Form KD text, and the same text with non-spacing marks removed.
    /// </summary>
    private IEnumerable<string> NormalizeWordForAllCultures(string word)
    {
        var results = new HashSet<string>();
        foreach (var culture in _supportedCultures)
        {
            var normalized = word.ToLower(culture)
                .Normalize(NormalizationForm.FormKD);
            results.Add(normalized);

            // Form KD splits accented letters into base letter + combining mark,
            // so dropping the marks leaves the unaccented spelling.
            var withoutDiacritics = new string(
                normalized.Where(c => CharUnicodeInfo
                        .GetUnicodeCategory(c) != UnicodeCategory.NonSpacingMark)
                    .ToArray());
            results.Add(withoutDiacritics);
        }

        return results;
    }

    /// <summary>Finds indexed words that match <paramref name="query"/>.</summary>
    /// <param name="query">A single word; it is not split on whitespace. Must not be null.</param>
    /// <returns>
    /// One result per distinct (document, original word) pair. <c>NormalizedQuery</c>
    /// holds the first normalized form of the query that produced the hit.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="query"/> is null.</exception>
    public IEnumerable<SearchResult> Search(string query)
    {
        if (query == null)
        {
            throw new ArgumentNullException(nameof(query));
        }

        var results = new List<SearchResult>();

        // Several normalized forms of one query can reach the same posting;
        // track what was already emitted so each hit is returned once.
        var seen = new HashSet<(int DocumentId, string Word)>();

        foreach (var normalizedQuery in NormalizeWordForAllCultures(query))
        {
            if (!_index.NormalizedWordLocations.TryGetValue(normalizedQuery, out var postings))
            {
                continue;
            }

            foreach (var posting in postings)
            {
                if (seen.Add(posting))
                {
                    results.Add(new SearchResult
                    {
                        DocumentId = posting.DocumentId,
                        OriginalWord = posting.Word,
                        NormalizedQuery = normalizedQuery
                    });
                }
            }
        }

        return results;
    }

    /// <summary>A single search hit.</summary>
    public class SearchResult
    {
        /// <summary>Identifier passed to <see cref="IndexText"/>.</summary>
        public int DocumentId { get; set; }

        /// <summary>The word as it appeared in the indexed text.</summary>
        public string OriginalWord { get; set; }

        /// <summary>The normalized query form that matched.</summary>
        public string NormalizedQuery { get; set; }
    }
}
Conclusion
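Working with Unicode requires careful consideration and proper approaches. The examples above demonstrate how to handle bidirectional text, normalize text arriving from multiple sources, guard against Unicode-based security issues such as homoglyph attacks, and build a search index that works across languages.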
These solutions are crucial for creating modern, multilingual applications.