Complex Unicode Cases in C#

Complex Unicode Cases in C#

1. Handling Bidirectional Text

One of the most challenging scenarios in internationalization is handling bidirectional text, where left-to-right (LTR) and right-to-left (RTL) scripts appear in the same string.

/// <summary>
/// Inserts explicit Unicode direction marks where a string switches between
/// left-to-right and right-to-left letters.
/// </summary>
public class BidiTextManager
{
    private const char LeftToRightMark = '\u200E';
    private const char RightToLeftMark = '\u200F';

    /// <summary>
    /// Returns a copy of <paramref name="input"/> with a Right-to-Left Mark (U+200F)
    /// inserted before the first letter of each right-to-left run and a
    /// Left-to-Right Mark (U+200E) inserted before the first left-to-right letter
    /// that follows such a run.
    /// </summary>
    /// <param name="input">The text to annotate. Must not be null.</param>
    /// <returns>The annotated text; text without right-to-left letters is returned unchanged in content.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
    public static string NormalizeBidiText(string input)
    {
        if (input == null)
        {
            throw new ArgumentNullException(nameof(input));
        }

        var result = new StringBuilder(input.Length);
        bool isRtl = false;

        for (int i = 0; i < input.Length; i++)
        {
            // Walk by code point so that supplementary-plane letters are classified as a whole.
            int length = char.IsSurrogatePair(input, i) ? 2 : 1;

            // Only letters switch the current direction. Spaces, digits and punctuation
            // stay inside the surrounding run; forcing a mark in front of them would
            // split a multi-word right-to-left phrase into separate runs.
            if (IsLetter(CharUnicodeInfo.GetUnicodeCategory(input, i)))
            {
                bool rtl = IsRtlCodePoint(char.ConvertToUtf32(input, i));
                if (rtl != isRtl)
                {
                    result.Append(rtl ? RightToLeftMark : LeftToRightMark);
                    isRtl = rtl;
                }
            }

            result.Append(input, i, length);
            i += length - 1;
        }

        return result.ToString();
    }

    /// <summary>Returns true for the five Unicode letter categories.</summary>
    private static bool IsLetter(UnicodeCategory category)
    {
        switch (category)
        {
            case UnicodeCategory.UppercaseLetter:
            case UnicodeCategory.LowercaseLetter:
            case UnicodeCategory.TitlecaseLetter:
            case UnicodeCategory.ModifierLetter:
            case UnicodeCategory.OtherLetter:
                return true;
            default:
                return false;
        }
    }

    /// <summary>
    /// Returns true when the code point lies in a block reserved for right-to-left scripts.
    /// .NET exposes no public bidi-class lookup, so the blocks are listed explicitly.
    /// </summary>
    private static bool IsRtlCodePoint(int codePoint)
    {
        return (codePoint >= 0x0590 && codePoint <= 0x08FF)      // Hebrew, Arabic, Syriac, Thaana, NKo, ...
            || (codePoint >= 0xFB1D && codePoint <= 0xFDFF)      // Hebrew and Arabic presentation forms A
            || (codePoint >= 0xFE70 && codePoint <= 0xFEFC)      // Arabic presentation forms B
            || (codePoint >= 0x10800 && codePoint <= 0x10FFF)    // Supplementary-plane right-to-left scripts
            || (codePoint >= 0x1E800 && codePoint <= 0x1EFFF);   // Mende Kikakui, Adlam, Arabic mathematical symbols
    }
}

2. Text Normalization for Multiple Sources

When text comes from different sources (mobile, web, desktop), we often encounter different normalization forms:

/// <summary>
/// Brings text from different sources into one canonical form: Unicode Form C,
/// plain ASCII spaces, and ASCII quotes and ellipses.
/// </summary>
public class TextNormalizer
{
    /// <summary>Outcome of a <see cref="NormalizeText"/> call.</summary>
    public class NormalizationResult
    {
        /// <summary>The text after all normalization steps.</summary>
        public string NormalizedText { get; set; }

        /// <summary>True when <see cref="NormalizedText"/> differs from the input.</summary>
        public bool HasChanges { get; set; }

        /// <summary>
        /// Maps each replaced character (as a string) to the text that replaced it.
        /// Changes caused only by Form C composition are not listed here; they are
        /// reflected in <see cref="HasChanges"/>.
        /// </summary>
        public Dictionary<string, string> Modifications { get; set; }
    }

    private static readonly char[] SpaceVariants =
    {
        '\u00A0', // NO-BREAK SPACE
        '\u2002', // EN SPACE
        '\u2003', // EM SPACE
        '\u2009'  // THIN SPACE
    };

    private static readonly KeyValuePair<string, string>[] PunctuationReplacements =
    {
        new KeyValuePair<string, string>("\u2018", "'"),   // LEFT SINGLE QUOTATION MARK
        new KeyValuePair<string, string>("\u2019", "'"),   // RIGHT SINGLE QUOTATION MARK
        new KeyValuePair<string, string>("\u201C", "\""),  // LEFT DOUBLE QUOTATION MARK
        new KeyValuePair<string, string>("\u201D", "\""),  // RIGHT DOUBLE QUOTATION MARK
        new KeyValuePair<string, string>("\u2026", "...")  // HORIZONTAL ELLIPSIS
    };

    /// <summary>
    /// Normalizes <paramref name="input"/> to Form C, replaces the listed space
    /// variants with U+0020, and replaces typographic quotes and the ellipsis with
    /// their ASCII equivalents.
    /// </summary>
    /// <param name="input">The text to normalize. Must not be null.</param>
    /// <returns>The normalized text together with a record of the substitutions applied.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
    public static NormalizationResult NormalizeText(string input)
    {
        if (input == null)
        {
            throw new ArgumentNullException(nameof(input));
        }

        var modifications = new Dictionary<string, string>();

        string text = input.Normalize(NormalizationForm.FormC);
        text = NormalizeSpaces(text, modifications);
        text = NormalizePunctuation(text, modifications);

        return new NormalizationResult
        {
            NormalizedText = text,
            HasChanges = !string.Equals(input, text, StringComparison.Ordinal),
            Modifications = modifications
        };
    }

    /// <summary>Replaces each space variant present in the text and records the substitution.</summary>
    private static string NormalizeSpaces(string input, IDictionary<string, string> modifications)
    {
        foreach (char space in SpaceVariants)
        {
            if (input.IndexOf(space) >= 0)
            {
                modifications[space.ToString()] = " ";
                input = input.Replace(space, ' ');
            }
        }

        return input;
    }

    /// <summary>Replaces each typographic mark present in the text and records the substitution.</summary>
    private static string NormalizePunctuation(string input, IDictionary<string, string> modifications)
    {
        foreach (KeyValuePair<string, string> rule in PunctuationReplacements)
        {
            // Ordinal search: a culture-sensitive search could match or ignore
            // characters differently from the ordinal Replace below.
            if (input.IndexOf(rule.Key, StringComparison.Ordinal) >= 0)
            {
                modifications[rule.Key] = rule.Value;
                input = input.Replace(rule.Key, rule.Value);
            }
        }

        return input;
    }
}

3. Unicode Security Concerns

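Unicode presents security challenges, such as homoglyph attacks, in which visually similar characters from another script (for example, Cyrillic 'а' in place of Latin 'a') are used to spoof trusted text: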

/// <summary>
/// Helpers for detecting lookalike-character spoofing and for stripping
/// characters that should not survive in sanitized text.
/// </summary>
public class SecurityValidator
{
    // Each Latin letter maps to characters from other scripts that look like it.
    // The table must never contain the key itself or ordinary ASCII punctuation:
    // either would flag normal text as an attack.
    private static readonly Dictionary<char, char[]> Homoglyphs = new Dictionary<char, char[]>
    {
        {'a', new[] {'\u0430', '\u03B1'}},            // Cyrillic a, Greek alpha
        {'e', new[] {'\u0435', '\u044D', '\u03B5'}},  // Cyrillic ie, Cyrillic e, Greek epsilon
        {'o', new[] {'\u043E', '\u03BF'}}             // Cyrillic o, Greek omicron
    };

    private static readonly Dictionary<UnicodeCategory, string> ReplacementRules =
        new Dictionary<UnicodeCategory, string>
        {
            {UnicodeCategory.Control, string.Empty},
            {UnicodeCategory.Format, string.Empty},
            {UnicodeCategory.Surrogate, "?"},
            {UnicodeCategory.PrivateUse, "?"},
            {UnicodeCategory.OtherNotAssigned, "?"}
        };

    /// <summary>
    /// Reports whether <paramref name="input"/> contains a Latin letter from the
    /// homoglyph table together with at least one of that letter's lookalikes.
    /// </summary>
    /// <param name="input">The text to inspect. Must not be null.</param>
    /// <returns>True when a Latin letter and one of its lookalikes both occur in the text.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
    public static bool IsHomoglyphAttackAttempt(string input)
    {
        if (input == null)
        {
            throw new ArgumentNullException(nameof(input));
        }

        foreach (KeyValuePair<char, char[]> entry in Homoglyphs)
        {
            if (input.IndexOf(entry.Key) < 0)
            {
                continue;
            }

            foreach (char lookalike in entry.Value)
            {
                if (input.IndexOf(lookalike) >= 0)
                {
                    return true;
                }
            }
        }

        return false;
    }

    /// <summary>
    /// Normalizes <paramref name="input"/> to Form KC, removes control and format
    /// characters, and replaces lone surrogates, private-use and unassigned code
    /// points with "?".
    /// </summary>
    /// <param name="input">The text to sanitize. Must not be null.</param>
    /// <returns>The sanitized text. Valid supplementary-plane characters are preserved.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
    public static string SanitizeInput(string input)
    {
        if (input == null)
        {
            throw new ArgumentNullException(nameof(input));
        }

        // String.Normalize throws ArgumentException on ill-formed UTF-16, so lone
        // surrogates are replaced before normalization rather than after it.
        string normalized = ReplaceLoneSurrogates(input).Normalize(NormalizationForm.FormKC);
        var result = new StringBuilder(normalized.Length);

        for (int i = 0; i < normalized.Length; i++)
        {
            // Classify whole code points: a valid surrogate pair is one character,
            // not two Surrogate halves.
            int length = char.IsSurrogatePair(normalized, i) ? 2 : 1;
            UnicodeCategory category = CharUnicodeInfo.GetUnicodeCategory(normalized, i);

            if (ReplacementRules.TryGetValue(category, out string replacement))
            {
                result.Append(replacement);
            }
            else
            {
                result.Append(normalized, i, length);
            }

            i += length - 1;
        }

        return result.ToString();
    }

    /// <summary>Replaces every surrogate code unit that is not part of a valid pair with '?'.</summary>
    private static string ReplaceLoneSurrogates(string input)
    {
        StringBuilder builder = null;

        for (int i = 0; i < input.Length; i++)
        {
            if (char.IsSurrogatePair(input, i))
            {
                i++; // skip the low half of a valid pair
                continue;
            }

            if (char.IsSurrogate(input[i]))
            {
                if (builder == null)
                {
                    builder = new StringBuilder(input);
                }

                builder[i] = '?';
            }
        }

        return builder == null ? input : builder.ToString();
    }
}

4. Multilingual Search Optimization

The following engine indexes culture-aware, diacritic-insensitive forms of each word so that multilingual text can be searched efficiently:

/// <summary>
/// In-memory word index that matches queries regardless of letter case and
/// diacritics, using the casing rules of every supported culture.
/// </summary>
public class MultilingualSearchEngine
{
    private class SearchIndex
    {
        // normalized form -> document id -> original words in that document that
        // produced the normalized form. Keeping the words per (form, document)
        // lets a hit report the word that actually matched.
        public Dictionary<string, Dictionary<int, List<string>>> Postings { get; }

        public SearchIndex()
        {
            Postings = new Dictionary<string, Dictionary<int, List<string>>>(StringComparer.Ordinal);
        }
    }

    private readonly SearchIndex _index = new SearchIndex();
    private readonly CultureInfo[] _supportedCultures;

    /// <summary>Creates an engine for the given cultures.</summary>
    /// <param name="cultures">
    /// Cultures whose lower-casing rules are applied. When none are supplied the
    /// invariant culture is used, so that indexing never silently stores nothing.
    /// </param>
    public MultilingualSearchEngine(params CultureInfo[] cultures)
    {
        _supportedCultures = cultures == null || cultures.Length == 0
            ? new[] { CultureInfo.InvariantCulture }
            : (CultureInfo[])cultures.Clone();
    }

    /// <summary>Splits <paramref name="text"/> on spaces, tabs and line breaks and indexes every word.</summary>
    /// <param name="text">The document text. Must not be null.</param>
    /// <param name="documentId">Identifier reported back in search results.</param>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public void IndexText(string text, int documentId)
    {
        if (text == null)
        {
            throw new ArgumentNullException(nameof(text));
        }

        var words = text.Split(new[] {' ', '\t', '\n', '\r'},
            StringSplitOptions.RemoveEmptyEntries);

        foreach (var word in words)
        {
            foreach (var normalized in NormalizeWordForAllCultures(word))
            {
                if (!_index.Postings.TryGetValue(normalized, out var documents))
                {
                    documents = new Dictionary<int, List<string>>();
                    _index.Postings[normalized] = documents;
                }

                if (!documents.TryGetValue(documentId, out var originals))
                {
                    originals = new List<string>();
                    documents[documentId] = originals;
                }

                if (!originals.Contains(word))
                {
                    originals.Add(word);
                }
            }
        }
    }

    /// <summary>
    /// Produces the lookup keys for a word: for each supported culture, the
    /// lower-cased Form KD text and the same text with non-spacing marks removed.
    /// </summary>
    private IEnumerable<string> NormalizeWordForAllCultures(string word)
    {
        var results = new HashSet<string>(StringComparer.Ordinal);

        foreach (var culture in _supportedCultures)
        {
            var decomposed = word.ToLower(culture)
                                 .Normalize(NormalizationForm.FormKD);

            results.Add(decomposed);

            var withoutDiacritics = new StringBuilder(decomposed.Length);
            foreach (char c in decomposed)
            {
                if (CharUnicodeInfo.GetUnicodeCategory(c) != UnicodeCategory.NonSpacingMark)
                {
                    withoutDiacritics.Append(c);
                }
            }

            results.Add(withoutDiacritics.ToString());
        }

        return results;
    }

    /// <summary>Finds indexed words that match <paramref name="query"/> after normalization.</summary>
    /// <param name="query">A single word to look up. Must not be null.</param>
    /// <returns>
    /// One result per distinct (document, original word) pair. When several
    /// normalized forms of the query hit the same pair, only the first is reported.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="query"/> is null.</exception>
    public IEnumerable<SearchResult> Search(string query)
    {
        if (query == null)
        {
            throw new ArgumentNullException(nameof(query));
        }

        var results = new List<SearchResult>();

        // SearchResult has no value equality, so duplicates are filtered explicitly
        // here instead of relying on Distinct().
        var reported = new Dictionary<int, HashSet<string>>();

        foreach (var normalizedQuery in NormalizeWordForAllCultures(query))
        {
            if (!_index.Postings.TryGetValue(normalizedQuery, out var documents))
            {
                continue;
            }

            foreach (var document in documents)
            {
                if (!reported.TryGetValue(document.Key, out var seenWords))
                {
                    seenWords = new HashSet<string>(StringComparer.Ordinal);
                    reported[document.Key] = seenWords;
                }

                foreach (var originalWord in document.Value)
                {
                    if (seenWords.Add(originalWord))
                    {
                        results.Add(new SearchResult
                        {
                            DocumentId = document.Key,
                            OriginalWord = originalWord,
                            NormalizedQuery = normalizedQuery
                        });
                    }
                }
            }
        }

        return results;
    }

    /// <summary>A single search hit.</summary>
    public class SearchResult
    {
        /// <summary>Identifier passed to <see cref="IndexText"/> for the matching document.</summary>
        public int DocumentId { get; set; }

        /// <summary>The word as it appeared in the indexed text.</summary>
        public string OriginalWord { get; set; }

        /// <summary>The normalized form of the query that produced this hit.</summary>
        public string NormalizedQuery { get; set; }
    }
}

Conclusion

Working with Unicode requires careful consideration and proper approaches. The examples above demonstrate:

  1. Proper bidirectional text handling
  2. Text normalization importance
  3. Security considerations
  4. Multilingual search optimization

These solutions are crucial for creating modern, multilingual applications.

要查看或添加评论,请登录

David Shergilashvili的更多文章