1 /*
2 * Copyright (C) 2024 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "utf.h"
18
19 #include <android-base/logging.h>
20
21 #include <algorithm>
22 #include <iterator>
23 #include <string>
24 #include <vector>
25
26 #include "../unchecked.h"
27 #include "byte_value.h"
28 #include "fpdfview.h"
29
30 namespace pdfClient_utils {
namespace {
// Code unit type used when the raw result bytes are viewed as UTF-16.
using value_t = char16_t;

// Number of bytes occupied by a single code unit.
constexpr size_t kValueBytes = sizeof(value_t);

// Inclusive lower and upper bounds of the leading-surrogate range.
constexpr value_t kLeadSurrogateMin = 0xD800u;
constexpr value_t kLeadSurrogateMax = 0xDBFFu;

// Returns true when |code_point| lies inside
// [kLeadSurrogateMin, kLeadSurrogateMax], false otherwise.
bool IsLeadingSurrogate(value_t code_point) {
    if (code_point < kLeadSurrogateMin) {
        return false;
    }
    return !(code_point > kLeadSurrogateMax);
}
}  // namespace
41
42 template <class T>
GetUtf8Result(const std::function<size_t (T *,size_t)> & f)43 std::string GetUtf8Result(const std::function<size_t(T*, size_t)>& f) {
44 std::vector<char> buffer;
45 GetBytes<T>(&buffer, f);
46
47 size_t result_size = buffer.size();
48 DCHECK_EQ(result_size % kValueBytes, 0)
49 << "Pdfium function should always return an even number of bytes.";
50
51 value_t* start = reinterpret_cast<value_t*>(buffer.data());
52 value_t* end = reinterpret_cast<value_t*>(buffer.data() + result_size);
53 // Remove null terminators if there are any.
54 while (start != end && *(end - 1) == 0) --end;
55
56 // If the last UTF-16 character is a leading surrogate, UTF8-CPP will fail to
57 // properly check the boundary and go off the end of the buffer. Since leading
58 // surrogates not followed by a trailing surrogate are invalid UTF-16 anyway,
59 // just remove them.
60 while (start != end && IsLeadingSurrogate(*(end - 1))) {
61 --end;
62 }
63
64 std::string result;
65 pdfClient::unchecked::utf16to8(start, end, std::back_inserter(result));
66 return result;
67 }
68
69 // Instantiate all known template specializations
70 template std::string GetUtf8Result<void>(const std::function<size_t(void*, size_t)>& f);
71 template std::string GetUtf8Result<FPDF_WCHAR>(const std::function<size_t(FPDF_WCHAR*, size_t)>& f);
72
Utf8ToUtf16Le(std::string_view utf8)73 std::u16string Utf8ToUtf16Le(std::string_view utf8) {
74 std::u16string result;
75 pdfClient::unchecked::utf8to16(utf8.begin(), utf8.end(), std::back_inserter(result));
76 #ifdef IS_BIG_ENDIAN
77 // Convert from big-endian to little-endian.
78 std::transform(result.begin(), result.end(), result.begin(), &LittleEndian::FromHost16);
79 #endif
80 return result;
81 }
82
83 } // namespace pdfClient_utils