aboutsummaryrefslogtreecommitdiff
path: root/core/fpdftext/cpdf_textpage.h
blob: 3ea13805167b238f7b28fc5870b71391a2a3f171 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
// Copyright 2016 PDFium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com

#ifndef CORE_FPDFTEXT_CPDF_TEXTPAGE_H_
#define CORE_FPDFTEXT_CPDF_TEXTPAGE_H_

#include <deque>
#include <functional>
#include <vector>

#include "core/fpdfapi/page/cpdf_pageobjectholder.h"
#include "core/fxcrt/cfx_widetextbuf.h"
#include "core/fxcrt/fx_coordinates.h"
#include "core/fxcrt/fx_string.h"
#include "core/fxcrt/unowned_ptr.h"
#include "third_party/base/optional.h"

class CPDF_Font;
class CPDF_FormObject;
class CPDF_Page;
class CPDF_TextObject;

#define TEXT_SPACE_CHAR L' '
#define TEXT_LINEFEED_CHAR L'\n'
#define TEXT_RETURN_CHAR L'\r'
#define TEXT_HYPHEN_CHAR L'-'
#define TEXT_HYPHEN L"-"
#define TEXT_CHARRATIO_GAPDELTA 0.070

struct PDFTEXT_Obj {
  PDFTEXT_Obj();
  PDFTEXT_Obj(const PDFTEXT_Obj& that);
  ~PDFTEXT_Obj();

  UnownedPtr<CPDF_TextObject> m_pTextObj;
  CFX_Matrix m_formMatrix;
};

class CPDF_TextPage {
 public:
  enum class CharType : uint8_t {
    kNormal,
    kGenerated,
    kNotUnicode,
    kHyphen,
    kPiece,
  };

  class CharInfo {
   public:
    CharInfo();
    CharInfo(const CharInfo&);
    ~CharInfo();

    int m_Index = 0;
    uint32_t m_CharCode = 0;
    wchar_t m_Unicode = 0;
    CharType m_CharType = CharType::kNormal;
    CFX_PointF m_Origin;
    CFX_FloatRect m_CharBox;
    UnownedPtr<CPDF_TextObject> m_pTextObj;
    CFX_Matrix m_Matrix;
  };

  CPDF_TextPage(const CPDF_Page* pPage, bool rtl);
  ~CPDF_TextPage();

  int CharIndexFromTextIndex(int text_index) const;
  int TextIndexFromCharIndex(int char_index) const;
  size_t size() const { return m_CharList.size(); }
  int CountChars() const;

  // These methods CHECK() to make sure |index| is within bounds.
  const CharInfo& GetCharInfo(size_t index) const;
  float GetCharFontSize(size_t index) const;

  std::vector<CFX_FloatRect> GetRectArray(int start, int nCount) const;
  int GetIndexAtPos(const CFX_PointF& point, const CFX_SizeF& tolerance) const;
  WideString GetTextByRect(const CFX_FloatRect& rect) const;
  WideString GetTextByObject(const CPDF_TextObject* pTextObj) const;

  // Returns string with the text from |m_TextBuf| that are covered by the input
  // range. |start| and |count| are in terms of the |m_CharIndices|, so the
  // range will be converted into appropriate indices.
  WideString GetPageText(int start, int count) const;
  WideString GetAllPageText() const { return GetPageText(0, CountChars()); }

  int CountRects(int start, int nCount);
  bool GetRect(int rectIndex, CFX_FloatRect* pRect) const;

 private:
  enum class TextOrientation {
    kUnknown,
    kHorizontal,
    kVertical,
  };

  enum class GenerateCharacter {
    kNone,
    kSpace,
    kLineBreak,
    kHyphen,
  };

  enum class MarkedContentState { kPass = 0, kDone, kDelay };

  void Init();
  bool IsHyphen(wchar_t curChar) const;
  void ProcessObject();
  void ProcessFormObject(CPDF_FormObject* pFormObj,
                         const CFX_Matrix& formMatrix);
  void ProcessTextObject(PDFTEXT_Obj pObj);
  void ProcessTextObject(CPDF_TextObject* pTextObj,
                         const CFX_Matrix& formMatrix,
                         const CPDF_PageObjectHolder* pObjList,
                         CPDF_PageObjectHolder::const_iterator ObjPos);
  GenerateCharacter ProcessInsertObject(const CPDF_TextObject* pObj,
                                        const CFX_Matrix& formMatrix);
  const CharInfo* GetPrevCharInfo() const;
  Optional<CharInfo> GenerateCharInfo(wchar_t unicode);
  bool IsSameAsPreTextObject(CPDF_TextObject* pTextObj,
                             const CPDF_PageObjectHolder* pObjList,
                             CPDF_PageObjectHolder::const_iterator iter) const;
  bool IsSameTextObject(CPDF_TextObject* pTextObj1,
                        CPDF_TextObject* pTextObj2) const;
  void CloseTempLine();
  MarkedContentState PreMarkedContent(PDFTEXT_Obj pObj);
  void ProcessMarkedContent(PDFTEXT_Obj pObj);
  void FindPreviousTextObject();
  void AddCharInfoByLRDirection(wchar_t wChar, const CharInfo& info);
  void AddCharInfoByRLDirection(wchar_t wChar, const CharInfo& info);
  TextOrientation GetTextObjectWritingMode(
      const CPDF_TextObject* pTextObj) const;
  TextOrientation FindTextlineFlowOrientation() const;
  void AppendGeneratedCharacter(wchar_t unicode, const CFX_Matrix& formMatrix);
  void SwapTempTextBuf(int32_t iCharListStartAppend, int32_t iBufStartAppend);
  WideString GetTextByPredicate(
      const std::function<bool(const CharInfo&)>& predicate) const;

  UnownedPtr<const CPDF_Page> const m_pPage;
  std::vector<uint16_t> m_CharIndices;
  std::deque<CharInfo> m_CharList;
  std::deque<CharInfo> m_TempCharList;
  CFX_WideTextBuf m_TextBuf;
  CFX_WideTextBuf m_TempTextBuf;
  UnownedPtr<CPDF_TextObject> m_pPreTextObj;
  CFX_Matrix m_perMatrix;
  const bool m_rtl;
  const CFX_Matrix m_DisplayMatrix;
  std::vector<CFX_FloatRect> m_SelRects;
  std::vector<PDFTEXT_Obj> m_LineObj;
  TextOrientation m_TextlineDir = TextOrientation::kUnknown;
  CFX_FloatRect m_CurlineRect;
};

#endif  // CORE_FPDFTEXT_CPDF_TEXTPAGE_H_