1 /*
2 * Copyright (C) 2024 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "normalize.h"
18
19 #include <stdint.h>
20
21 #include <algorithm>
22 #include <string>
23
24 #include "utf.h"
25
26 namespace pdfClient {
27
28 namespace {
29
// Line breaks normally arrive from pdfClient as the pair "\r\n". The one
// exception is a line that ends in a hyphen: there the hyphen and the line
// break are folded together and reported as the single codepoint '\x2'.
constexpr char32_t kBrokenWordMarker = U'\x2';
constexpr char32_t kCarriageReturn = U'\r';
constexpr char32_t kLineFeed = U'\n';
35
// Equivalence groups used for search normalization. Each entry is a UTF-8
// string; every character in a group is treated as equal to the first
// character of that group when searching.
//
// Invariants the rest of this file relies on:
//  - every character has a codepoint below 0x500 (see kTableSize);
//  - a character appears in at most one group, since a later group would
//    silently override the mapping set by an earlier one.
//
// Digraph letters (U+01C4..U+01CC, U+01F1..U+01F3, U+0132, U+0133, U+0149)
// and U+0103..U+0105 are spelled with \u escapes on purpose: text tools tend
// to expand or re-encode them, which would put plain ASCII letters such as
// 'J' into the wrong group or introduce codepoints beyond the table.
const char* kGroups[] = {
        // Treat the broken word marker the same as a hyphen when searching.
        "-\x2",
        // Space, tab and newline are all treated as equivalent when searching.
        " \t\r\n\u00A0",
        // Put upper, lower, and accented variants of the same letter in the same
        // group for searching. Generated using data from java.lang.Character.
        "aAªÀÁÂÃÄÅàáâãäåĀāĂ\u0103\u0104\u0105ǍǎǞǟǠǡǺǻȀȁȂȃȦȧ", "bB", "cCÇçĆćĈĉĊċČč",
        "dDĎď\u01C4\u01C5\u01C6\u01F1\u01F2\u01F3",
        "eEÈÉÊËèéêëĒēĔĕĖėĘęĚěȄȅȆȇȨȩ", "fF", "gGĜĝĞğĠġĢģǦǧǴǵ", "hHĤĥȞȟ",
        "iIÌÍÎÏìíîïĨĩĪīĬĭĮįİ\u0132\u0133ǏǐȈȉȊȋ", "jJĴĵǰ", "kKĶķǨǩ",
        "lLĹĺĻļĽľĿŀ\u01C7\u01C8\u01C9", "mM", "nNÑñŃńŅņŇň\u0149\u01CA\u01CB\u01CCǸǹ",
        "oOºÒÓÔÕÖòóôõöŌōŎŏŐőƠơǑǒǪǫǬǭȌȍȎȏȪȫȬȭȮȯȰȱ", "pP", "qQ", "rRŔŕŖŗŘřȐȑȒȓ", "sSŚśŜŝŞşŠšſȘș",
        "tTŢţŤťȚț", "uUÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųƯưǓǔǕǖǗǘǙǚǛǜȔȕȖȗ", "vV", "wWŴŵ", "xX", "yYÝýÿŶŷŸȲȳ",
        "zZŹźŻżŽž", "æÆǢǣǼǽ", "ðÐ", "øØǾǿ", "þÞ", "đĐ", "ħĦ", "łŁ", "ŋŊ", "œŒ", "ŧŦ", "ƀɃ", "ƃƂ",
        "ƅƄ", "ƈƇ", "ƌƋ", "ƒƑ", "ƕǶ", "ƙƘ", "ƚȽ", "ƞȠ", "ƣƢ", "ƥƤ", "ƨƧ", "ƭƬ", "ƴƳ", "ƶƵ", "ƹƸ",
        "ƽƼ", "ƿǷ", "ǝƎ", "ǥǤ", "ȝȜ", "ȣȢ", "ȥȤ", "ȼȻ", "ɂɁ", "ɇɆ", "ɉɈ", "ɋɊ", "ɍɌ", "ɏɎ", "ɓƁ",
        "ɔƆ", "ɖƉ", "ɗƊ", "əƏ", "ɛƐ", "ɠƓ", "ɣƔ", "ɨƗ", "ɩƖ", "ɯƜ", "ɲƝ", "ɵƟ", "ʀƦ", "ʃƩ", "ʈƮ",
        "ʉɄ", "ʊƱ", "ʋƲ", "ʌɅ", "ʒƷǮǯ", "ͱͰ", "ͳͲ", "ͷͶ", "ͻϽ", "ͼϾ", "ͽϿ", "αΆΑά", "βΒϐ", "γΓ",
        "δΔ", "εΈΕέϵ", "ζΖ", "ηΉΗή", "θΘϑϴ", "ιΊΐΙΪίϊ", "κΚϰ", "λΛ", "μµΜ", "νΝ", "ξΞ", "οΌΟό",
        "πΠϖ", "ρΡϱ", "ςϲ", "σΣϹ", "τΤ", "υΎΥΫΰϋύϒϓϔ", "φΦϕ", "χΧ", "ψΨ", "ωΏΩώ", "ϗϏ", "ϙϘ", "ϛϚ",
        "ϝϜ", "ϟϞ", "ϡϠ", "ϣϢ", "ϥϤ", "ϧϦ", "ϩϨ", "ϫϪ", "ϭϬ", "ϯϮ", "ϸϷ", "ϻϺ", "аАӐӑӒӓ", "бБ",
        "вВ", "гЃГѓ", "дД", "еЀЁЕѐёӖӗ", "жЖӁӂӜӝ", "зЗӞӟ", "иЍИЙйѝӢӣӤӥ", "кЌКќ", "лЛ", "мМ", "нН",
        "оОӦӧ", "пП", "рР", "сС", "тТ", "уЎУўӮӯӰӱӲӳ", "фФ", "хХ", "цЦ", "чЧӴӵ", "шШ", "щЩ", "ъЪ",
        "ыЫӸӹ", "ьЬ", "эЭӬӭ", "юЮ", "яЯ", "ђЂ", "єЄ", "ѕЅ", "іІЇї", "јЈ", "љЉ", "њЊ", "ћЋ", "џЏ",
        "ѡѠ", "ѣѢ", "ѥѤ", "ѧѦ", "ѩѨ", "ѫѪ", "ѭѬ", "ѯѮ", "ѱѰ", "ѳѲ", "ѵѴѶѷ", "ѹѸ", "ѻѺ", "ѽѼ", "ѿѾ",
        "ҁҀ", "ҋҊ", "ҍҌ", "ҏҎ", "ґҐ", "ғҒ", "ҕҔ", "җҖ", "ҙҘ", "қҚ", "ҝҜ", "ҟҞ", "ҡҠ", "ңҢ", "ҥҤ",
        "ҧҦ", "ҩҨ", "ҫҪ", "ҭҬ", "үҮ", "ұҰ", "ҳҲ", "ҵҴ", "ҷҶ", "ҹҸ", "һҺ", "ҽҼ", "ҿҾ", "ӄӃ", "ӆӅ",
        "ӈӇ", "ӊӉ", "ӌӋ", "ӎӍ", "ӏӀ", "ӕӔ", "әӘӚӛ", "ӡӠ", "өӨӪӫ", "ӷӶ", "ӻӺ", "ӽӼ"};
63
64 const size_t kNumGroups = sizeof(kGroups) / sizeof(kGroups[0]);
65
66 // All of the characters that are normalized have codepoints of < 0x500.
67 const size_t kTableSize = 0x500;
68
CreateTable()69 const uint16_t* CreateTable() {
70 static uint16_t table[kTableSize];
71 for (size_t i = 0; i < kTableSize; i++) {
72 table[i] = i;
73 }
74 for (size_t i = 0; i < kNumGroups; i++) {
75 std::u32string group = Utf8ToUtf32(kGroups[i]);
76 for (size_t j = 0; j < group.length(); j++) {
77 table[group[j]] = group[0];
78 }
79 }
80 return table;
81 }
82
83 } // namespace
84
NormalizeForSearch(char32_t codepoint)85 char32_t NormalizeForSearch(char32_t codepoint) {
86 // Table is created on first use and cached.
87 static const uint16_t* table = CreateTable();
88 if (codepoint < kTableSize) {
89 return table[codepoint];
90 }
91 return codepoint;
92 }
93
// Returns true only when both codepoints are the plain space character
// (U+0020). Used as the "equal" predicate when collapsing runs of spaces.
bool BothAreSpaces(char32_t left_codepoint, char32_t right_codepoint) {
    constexpr char32_t kSpace = U'\x20';
    if (left_codepoint != kSpace) {
        return false;
    }
    return right_codepoint == kSpace;
}
97
NormalizeStringForSearch(std::u32string * utf32)98 void NormalizeStringForSearch(std::u32string* utf32) {
99 std::transform(utf32->begin(), utf32->end(), utf32->begin(), NormalizeForSearch);
100 // Collapse repeated whitespace into a single space:
101 utf32->erase(std::unique(utf32->begin(), utf32->end(), BothAreSpaces), utf32->end());
102 }
103
IsSkippableForSearch(char32_t codepoint,char32_t prev_codepoint)104 bool IsSkippableForSearch(char32_t codepoint, char32_t prev_codepoint) {
105 if (codepoint == kBrokenWordMarker) {
106 // This can be skipped so words can be found when broken onto two lines.
107 return true;
108 }
109 if (BothAreSpaces(NormalizeForSearch(codepoint), NormalizeForSearch(prev_codepoint))) {
110 // Repeated whitespace can be skipped so that all whitespace is equivalent.
111 return true;
112 }
113 return false;
114 }
115
IsLineBreak(char32_t codepoint)116 bool IsLineBreak(char32_t codepoint) {
117 switch (codepoint) {
118 case kBrokenWordMarker:
119 case kLineFeed:
120 return true;
121 default:
122 return false;
123 }
124 }
125
IsWordBreak(char32_t codepoint)126 bool IsWordBreak(char32_t codepoint) {
127 char32_t normalized = NormalizeForSearch(codepoint);
128 return normalized == ' ' || normalized == '-';
129 }
130
AppendpdfClientCodepointAsUtf8(char32_t codepoint,std::string * output)131 void AppendpdfClientCodepointAsUtf8(char32_t codepoint, std::string* output) {
132 if (codepoint == kBrokenWordMarker) {
133 output->append("-\r\n"); // We give the user what the text looks like.
134 } else {
135 AppendCodepointAsUtf8(codepoint, output);
136 }
137 }
138
139 } // namespace pdfClient