1 /*
2  * Copyright (C) 2024 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "normalize.h"
18 
19 #include <stdint.h>
20 
21 #include <algorithm>
22 #include <string>
23 
24 #include "utf.h"
25 
26 namespace pdfClient {
27 
28 namespace {
29 
// Line breaks from pdfClient usually arrive as "\r\n". The exception is a line
// that ends in a hyphen: there the hyphen and the line break are reported
// together as the single code point '\x2'.
constexpr char32_t kBrokenWordMarker = U'\x2';
constexpr char32_t kCarriageReturn = U'\r';
constexpr char32_t kLineFeed = U'\n';
35 
// Equivalence groups used for search normalization. Each entry is a UTF-8
// string; CreateTable() maps every code point in a group to the group's first
// code point. Every code point listed here must be below kTableSize (0x500),
// and a code point should appear in one group only, because a later group
// overrides an earlier one.
//
// Characters that are easily damaged by text tooling (multi-letter digraphs
// and a few accented letters) are written as \u escapes so that each one stays
// a single code point in the table.
const char* kGroups[] = {
        // Treat the broken word marker the same as a hyphen when searching.
        "-\x2",
        // Space, tab and newline are all treated as equivalent when searching.
        " \t\r\n\u00A0",
        // Put upper, lower, and accented variants of the same letter in the same group
        // for searching. Generated using data from java.lang.Character
        // \u0103..\u0105: a with breve, A/a with ogonek.
        "aAªÀÁÂÃÄÅàáâãäåĀāĂ\u0103\u0104\u0105ǍǎǞǟǠǡǺǻȀȁȂȃȦȧ", "bB", "cCÇçĆćĈĉĊċČč",
        // \u01C4..\u01C6 and \u01F1..\u01F3: the DZ-caron and DZ digraphs.
        "dDĎď\u01C4\u01C5\u01C6\u01F1\u01F2\u01F3",
        "eEÈÉÊËèéêëĒēĔĕĖėĘęĚěȄȅȆȇȨȩ", "fF", "gGĜĝĞğĠġĢģǦǧǴǵ", "hHĤĥȞȟ",
        // \u0132, \u0133: the IJ digraph.
        "iIÌÍÎÏìíîïĨĩĪīĬĭĮįİ\u0132\u0133ǏǐȈȉȊȋ", "jJĴĵǰ", "kKĶķǨǩ",
        // \u01C7..\u01C9: the LJ digraph.
        "lLĹĺĻļĽľĿŀ\u01C7\u01C8\u01C9", "mM",
        // \u0149: n preceded by apostrophe; \u01CA..\u01CC: the NJ digraph.
        "nNÑñŃńŅņŇň\u0149\u01CA\u01CB\u01CCǸǹ",
        "oOºÒÓÔÕÖòóôõöŌōŎŏŐőƠơǑǒǪǫǬǭȌȍȎȏȪȫȬȭȮȯȰȱ", "pP", "qQ", "rRŔŕŖŗŘřȐȑȒȓ", "sSŚśŜŝŞşŠšſȘș",
        "tTŢţŤťȚț", "uUÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųƯưǓǔǕǖǗǘǙǚǛǜȔȕȖȗ", "vV", "wWŴŵ", "xX", "yYÝýÿŶŷŸȲȳ",
        "zZŹźŻżŽž", "æÆǢǣǼǽ", "ðÐ", "øØǾǿ", "þÞ", "đĐ", "ħĦ", "łŁ", "ŋŊ", "œŒ", "ŧŦ", "ƀɃ", "ƃƂ",
        "ƅƄ", "ƈƇ", "ƌƋ", "ƒƑ", "ƕǶ", "ƙƘ", "ƚȽ", "ƞȠ", "ƣƢ", "ƥƤ", "ƨƧ", "ƭƬ", "ƴƳ", "ƶƵ", "ƹƸ",
        "ƽƼ", "ƿǷ", "ǝƎ", "ǥǤ", "ȝȜ", "ȣȢ", "ȥȤ", "ȼȻ", "ɂɁ", "ɇɆ", "ɉɈ", "ɋɊ", "ɍɌ", "ɏɎ", "ɓƁ",
        "ɔƆ", "ɖƉ", "ɗƊ", "əƏ", "ɛƐ", "ɠƓ", "ɣƔ", "ɨƗ", "ɩƖ", "ɯƜ", "ɲƝ", "ɵƟ", "ʀƦ", "ʃƩ", "ʈƮ",
        "ʉɄ", "ʊƱ", "ʋƲ", "ʌɅ", "ʒƷǮǯ", "ͱͰ", "ͳͲ", "ͷͶ", "ͻϽ", "ͼϾ", "ͽϿ", "αΆΑά", "βΒϐ", "γΓ",
        "δΔ", "εΈΕέϵ", "ζΖ", "ηΉΗή", "θΘϑϴ", "ιΊΐΙΪίϊ", "κΚϰ", "λΛ", "μµΜ", "νΝ", "ξΞ", "οΌΟό",
        "πΠϖ", "ρΡϱ", "ςϲ", "σΣϹ", "τΤ", "υΎΥΫΰϋύϒϓϔ", "φΦϕ", "χΧ", "ψΨ", "ωΏΩώ", "ϗϏ", "ϙϘ", "ϛϚ",
        "ϝϜ", "ϟϞ", "ϡϠ", "ϣϢ", "ϥϤ", "ϧϦ", "ϩϨ", "ϫϪ", "ϭϬ", "ϯϮ", "ϸϷ", "ϻϺ", "аАӐӑӒӓ", "бБ",
        "вВ", "гЃГѓ", "дД", "еЀЁЕѐёӖӗ", "жЖӁӂӜӝ", "зЗӞӟ", "иЍИЙйѝӢӣӤӥ", "кЌКќ", "лЛ", "мМ", "нН",
        "оОӦӧ", "пП", "рР", "сС", "тТ", "уЎУўӮӯӰӱӲӳ", "фФ", "хХ", "цЦ", "чЧӴӵ", "шШ", "щЩ", "ъЪ",
        "ыЫӸӹ", "ьЬ", "эЭӬӭ", "юЮ", "яЯ", "ђЂ", "єЄ", "ѕЅ", "іІЇї", "јЈ", "љЉ", "њЊ", "ћЋ", "џЏ",
        "ѡѠ", "ѣѢ", "ѥѤ", "ѧѦ", "ѩѨ", "ѫѪ", "ѭѬ", "ѯѮ", "ѱѰ", "ѳѲ", "ѵѴѶѷ", "ѹѸ", "ѻѺ", "ѽѼ", "ѿѾ",
        "ҁҀ", "ҋҊ", "ҍҌ", "ҏҎ", "ґҐ", "ғҒ", "ҕҔ", "җҖ", "ҙҘ", "қҚ", "ҝҜ", "ҟҞ", "ҡҠ", "ңҢ", "ҥҤ",
        "ҧҦ", "ҩҨ", "ҫҪ", "ҭҬ", "үҮ", "ұҰ", "ҳҲ", "ҵҴ", "ҷҶ", "ҹҸ", "һҺ", "ҽҼ", "ҿҾ", "ӄӃ", "ӆӅ",
        "ӈӇ", "ӊӉ", "ӌӋ", "ӎӍ", "ӏӀ", "ӕӔ", "әӘӚӛ", "ӡӠ", "өӨӪӫ", "ӷӶ", "ӻӺ", "ӽӼ"};
63 
64 const size_t kNumGroups = sizeof(kGroups) / sizeof(kGroups[0]);
65 
66 // All of the characters that are normalized have codepoints of < 0x500.
67 const size_t kTableSize = 0x500;
68 
CreateTable()69 const uint16_t* CreateTable() {
70     static uint16_t table[kTableSize];
71     for (size_t i = 0; i < kTableSize; i++) {
72         table[i] = i;
73     }
74     for (size_t i = 0; i < kNumGroups; i++) {
75         std::u32string group = Utf8ToUtf32(kGroups[i]);
76         for (size_t j = 0; j < group.length(); j++) {
77             table[group[j]] = group[0];
78         }
79     }
80     return table;
81 }
82 
83 }  // namespace
84 
NormalizeForSearch(char32_t codepoint)85 char32_t NormalizeForSearch(char32_t codepoint) {
86     // Table is created on first use and cached.
87     static const uint16_t* table = CreateTable();
88     if (codepoint < kTableSize) {
89         return table[codepoint];
90     }
91     return codepoint;
92 }
93 
// Returns true only when both code points are the plain space character
// (U+0020). Callers are expected to normalize other whitespace first.
bool BothAreSpaces(char32_t left_codepoint, char32_t right_codepoint) {
    constexpr char32_t kSpace = U'\x20';
    if (left_codepoint != kSpace) {
        return false;
    }
    return right_codepoint == kSpace;
}
97 
NormalizeStringForSearch(std::u32string * utf32)98 void NormalizeStringForSearch(std::u32string* utf32) {
99     std::transform(utf32->begin(), utf32->end(), utf32->begin(), NormalizeForSearch);
100     // Collapse repeated whitespace into a single space:
101     utf32->erase(std::unique(utf32->begin(), utf32->end(), BothAreSpaces), utf32->end());
102 }
103 
IsSkippableForSearch(char32_t codepoint,char32_t prev_codepoint)104 bool IsSkippableForSearch(char32_t codepoint, char32_t prev_codepoint) {
105     if (codepoint == kBrokenWordMarker) {
106         // This can be skipped so words can be found when broken onto two lines.
107         return true;
108     }
109     if (BothAreSpaces(NormalizeForSearch(codepoint), NormalizeForSearch(prev_codepoint))) {
110         // Repeated whitespace can be skipped so that all whitespace is equivalent.
111         return true;
112     }
113     return false;
114 }
115 
IsLineBreak(char32_t codepoint)116 bool IsLineBreak(char32_t codepoint) {
117     switch (codepoint) {
118         case kBrokenWordMarker:
119         case kLineFeed:
120             return true;
121         default:
122             return false;
123     }
124 }
125 
IsWordBreak(char32_t codepoint)126 bool IsWordBreak(char32_t codepoint) {
127     char32_t normalized = NormalizeForSearch(codepoint);
128     return normalized == ' ' || normalized == '-';
129 }
130 
AppendpdfClientCodepointAsUtf8(char32_t codepoint,std::string * output)131 void AppendpdfClientCodepointAsUtf8(char32_t codepoint, std::string* output) {
132     if (codepoint == kBrokenWordMarker) {
133         output->append("-\r\n");  // We give the user what the text looks like.
134     } else {
135         AppendCodepointAsUtf8(codepoint, output);
136     }
137 }
138 
139 }  // namespace pdfClient