1 /*
2  * Copyright (C) 2024 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "utf.h"
18 
19 #include <android-base/logging.h>
20 
21 #include <algorithm>
22 #include <iterator>
23 #include <string>
24 #include <vector>
25 
26 #include "../unchecked.h"
27 #include "byte_value.h"
28 #include "fpdfview.h"
29 
30 namespace pdfClient_utils {
namespace {
// Element type used when the raw byte buffer is viewed as UTF-16 code units.
using value_t = char16_t;
// Size in bytes of one UTF-16 code unit.
constexpr size_t kValueBytes = sizeof(value_t);
// Inclusive bounds of the leading (high) surrogate range.
constexpr value_t kLeadSurrogateMin = 0xD800u;
constexpr value_t kLeadSurrogateMax = 0xDBFFu;

// Returns true when `code_point` lies inside the inclusive range
// [kLeadSurrogateMin, kLeadSurrogateMax], i.e. it is a leading surrogate.
bool IsLeadingSurrogate(value_t code_point) {
    return kLeadSurrogateMin <= code_point && code_point <= kLeadSurrogateMax;
}
}  // namespace
41 
42 template <class T>
GetUtf8Result(const std::function<size_t (T *,size_t)> & f)43 std::string GetUtf8Result(const std::function<size_t(T*, size_t)>& f) {
44     std::vector<char> buffer;
45     GetBytes<T>(&buffer, f);
46 
47     size_t result_size = buffer.size();
48     DCHECK_EQ(result_size % kValueBytes, 0)
49             << "Pdfium function should always return an even number of bytes.";
50 
51     value_t* start = reinterpret_cast<value_t*>(buffer.data());
52     value_t* end = reinterpret_cast<value_t*>(buffer.data() + result_size);
53     // Remove null terminators if there are any.
54     while (start != end && *(end - 1) == 0) --end;
55 
56     // If the last UTF-16 character is a leading surrogate, UTF8-CPP will fail to
57     // properly check the boundary and go off the end of the buffer. Since leading
58     // surrogates not followed by a trailing surrogate are invalid UTF-16 anyway,
59     // just remove them.
60     while (start != end && IsLeadingSurrogate(*(end - 1))) {
61         --end;
62     }
63 
64     std::string result;
65     pdfClient::unchecked::utf16to8(start, end, std::back_inserter(result));
66     return result;
67 }
68 
69 // Instantiate all known template specializations
70 template std::string GetUtf8Result<void>(const std::function<size_t(void*, size_t)>& f);
71 template std::string GetUtf8Result<FPDF_WCHAR>(const std::function<size_t(FPDF_WCHAR*, size_t)>& f);
72 
Utf8ToUtf16Le(std::string_view utf8)73 std::u16string Utf8ToUtf16Le(std::string_view utf8) {
74     std::u16string result;
75     pdfClient::unchecked::utf8to16(utf8.begin(), utf8.end(), std::back_inserter(result));
76 #ifdef IS_BIG_ENDIAN
77     // Convert from big-endian to little-endian.
78     std::transform(result.begin(), result.end(), result.begin(), &LittleEndian::FromHost16);
79 #endif
80     return result;
81 }
82 
83 }  // namespace pdfClient_utils