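# The only settings you normally need to change are the path of the file to scan
# and the first/last page to scan (the last page is inclusive; do not add 1).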

import os
import time
import base64
import urllib
import requests
import threading
from concurrent.futures import ThreadPoolExecutor
from queue import Queue

# ---- Configuration and shared state ----
# Path of the PDF file to scan.
Path = "/Users/wangxijing/PycharmProjects/PythonProject1/神经外科学分册-145-223.pdf"
# The recognised text is written next to the PDF, with the extension replaced by .txt.
OutputFileName = os.path.splitext(Path)[0] + ".txt"
# First and last page to scan; EndPage is inclusive (no need to add 1).
StartPage = 1
EndPage = 79
# Which heading level was seen most recently: 0 = chapter ("章"), 1 = section ("节").
RecentChange = 0
# Titles of the chapter / section currently being processed.
MajorChapterName = ""
MinorChapterName = ""
# Shared queue through which the OCR workers hand (page number, data) tuples
# to the consumer, which writes them out in page order.
OrderedResults = Queue()
# API key (AK) and secret key (SK) used to request the access token.
# They can be supplied through the environment so that they do not have to
# live in the source file; the literals are only kept as a fallback so that
# existing runs keep working.
# NOTE(review): these literals are credentials committed to source control —
# consider rotating them and dropping the fallback values.
APIKEY = os.environ.get("BAIDU_OCR_API_KEY", "aUkDaYwGXoleRYvCeieUh6L0")
SECRETKEY = os.environ.get("BAIDU_OCR_SECRET_KEY", "kKh6FwvD9KY0WroJPKjkjooPhUx0fOFi")

# 1. Obtain the access token
# The token is requested once and then shared by all worker threads.
_ACCESS_TOKEN_LOCK = threading.Lock()
_ACCESS_TOKEN_CACHE = []


def get_access_token():
    """Return the access token obtained from APIKEY / SECRETKEY.

    The token endpoint is only contacted on the first successful call; later
    calls (one per scanned page) reuse the cached value instead of issuing a
    new HTTP request each time.
    NOTE(review): assumes a token stays valid for the duration of one run —
    confirm against the provider's documented token lifetime.

    Returns:
        The token as a string. If the response carries no "access_token"
        field the string "None" is returned (and nothing is cached), which
        matches what callers received before.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    with _ACCESS_TOKEN_LOCK:
        if _ACCESS_TOKEN_CACHE:
            return _ACCESS_TOKEN_CACHE[0]

        url = "https://aip.baidubce.com/oauth/2.0/token"
        params = {"grant_type": "client_credentials", "client_id": APIKEY, "client_secret": SECRETKEY}
        # A timeout keeps a stalled connection from blocking a worker forever.
        response = requests.post(url, params=params, timeout=30)
        token = response.json().get("access_token")
        if token is not None:
            _ACCESS_TOKEN_CACHE.append(str(token))
        return str(token)

# 2. Read the book
def get_file_content_as_base64(path, urlencoded=False):
    """Return the content of a file as base64 text.

    Args:
        path: path of the file to read.
        urlencoded: when true, the base64 text is additionally URL-encoded
            with ``quote_plus`` so it can be placed in a form-encoded body.

    Returns:
        The base64 (optionally URL-encoded) content as a ``str``.
    """
    # Imported explicitly: a bare ``import urllib`` does not guarantee that
    # the ``urllib.parse`` submodule has been loaded.
    from urllib.parse import quote_plus

    with open(path, "rb") as f:
        content = base64.b64encode(f.read()).decode("utf8")
    if urlencoded:
        content = quote_plus(content)
    return content

# 3. Call the OCR endpoint for one page (runs in several worker threads)
# The encoded PDF is identical for every page, so it is computed once per
# path and shared between the worker threads.
_ENCODED_PDF_LOCK = threading.Lock()
_ENCODED_PDF_CACHE = {}


def _get_encoded_pdf(path):
    """Return the URL-encoded base64 form of *path*, computing it only once."""
    with _ENCODED_PDF_LOCK:
        if path not in _ENCODED_PDF_CACHE:
            _ENCODED_PDF_CACHE[path] = get_file_content_as_base64(path, True)
        return _ENCODED_PDF_CACHE[path]


def process_page(PageNumber):
    """Run OCR on one page of the PDF at ``Path``.

    Args:
        PageNumber: number of the page to recognise (sent as ``pdf_file_num``).

    Returns:
        The entries of the response's "words_result" list whose
        ``location.top`` is at least 50; an empty list when the response has
        no "words_result".

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    PdfFile = _get_encoded_pdf(Path)
    url = "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate?access_token=" + get_access_token()
    payload = f"pdf_file={PdfFile}&pdf_file_num={PageNumber}&recognize_granularity=big&detect_direction=false&vertexes_location=false&paragraph=false&probability=false&char_probability=false&multidirectional_recognize=false"
    headers = {
        "Content-Type": "application/x-www-form-urlencoded",
        "Accept": "application/json"
    }

    # Fetch the complete result first. The timeout keeps a stalled connection
    # from blocking a worker thread forever.
    response = requests.post(url, headers=headers, data=payload.encode("utf-8"), timeout=60)
    PreResult = response.json()

    # NOTE(review): assumes failures are reported through an "error_code"
    # field in the JSON body — confirm against the API documentation.
    # Without this message a failed page would silently come out empty.
    if "error_code" in PreResult:
        import logging
        logging.getLogger(__name__).warning(
            "OCR request for page %s failed: %s %s",
            PageNumber, PreResult.get("error_code"), PreResult.get("error_msg"),
        )

    Result = PreResult.get("words_result", [])

    # Drop every entry located less than 50 units from the top of the page.
    return [item for item in Result if item["location"]["top"] >= 50]

# 4. Put the OCR result of a page on the queue
def producer(PageNumber):
    """OCR one page and enqueue ``(PageNumber, data)`` on OrderedResults.

    Something is always enqueued, even when the OCR call fails: the consumer
    waits for every page number in turn, so a page that never arrives would
    make it block forever. A failed page is logged and enqueued as an empty
    list.

    Args:
        PageNumber: number of the page to process.
    """
    try:
        Data = process_page(PageNumber)
    except Exception:
        # Thread boundary: report the failure and keep the pipeline moving.
        import logging
        logging.getLogger(__name__).exception(
            "OCR failed for page %s; the page is written as empty", PageNumber
        )
        Data = []
    OrderedResults.put((PageNumber, Data))

# 5. Lay out the text of one page (single-threaded)
def process_text(PageData):
    """Format the OCR lines of one page and append them to OutputFileName.

    Every entry of ``PageData`` is a dict whose "words" value holds one
    recognised line. The value is rewritten in place and the rewritten lines
    are concatenated and appended to the output file.

    The module globals MajorChapterName, MinorChapterName and RecentChange
    carry the current chapter/section across pages and are updated here.

    Args:
        PageData: list of OCR entries of one page, in output order.
    """
    global MajorChapterName, MinorChapterName, RecentChange

    for item in PageData:
        # Correct a recognition error: ASCII commas become full-width commas.
        words = item["words"].replace(",", "，")

        # Chapter heading ("第…章…"): most diseases have a chapter of their own.
        if words.startswith("第") and "章" in words:
            # Split only once so a title that itself contains "章" is kept whole.
            MajorChapterName = words.split("章", 1)[1]
            # RecentChange == 0: following headings are labelled with the
            # chapter name only.
            RecentChange = 0
            # Start a new paragraph.
            words = "\n\n" + words + "\n"

        # Section heading ("第…节…"): some diseases only have a section.
        # A chapter heading handled above now starts with "\n", so it cannot
        # match here as well.
        if words.startswith("第") and "节" in words:
            MinorChapterName = words.split("节", 1)[1]
            # RecentChange == 1: following headings are labelled with both
            # the chapter and the section name.
            RecentChange = 1
            # Start a new paragraph.
            words = "\n\n" + words + "\n"

        # Insert the disease name in front of every "【…】" heading.
        if words.startswith("【"):
            # Taken out of the f-strings below: reusing the same quote
            # character inside an f-string is a syntax error before Python 3.12.
            label = words[1:]
            if RecentChange == 0:
                words = f"\n【{MajorChapterName}的{label}"
            elif RecentChange == 1:
                words = f"\n【{MajorChapterName}（{MinorChapterName}）的{label}"

        # Paragraph break before lines that start with "（" or whose second
        # or third character is "." (e.g. "1." / "12.").
        if words.startswith("（") or "." in (words[1:2], words[2:3]):
            words = "\n" + words

        item["words"] = words

    combined_text = "".join(item["words"] for item in PageData)
    with open(OutputFileName, "a", encoding="utf-8") as f:
        f.write(combined_text)



# Lay out the pages in page order
def consumer():
    """Write the pages StartPage..EndPage to the output in ascending order.

    Results arrive on OrderedResults in whatever order the workers finish.
    Pages that arrive early are kept in a dictionary until their turn comes,
    instead of being put back on the queue and polled again after a sleep.

    Blocks until every page in the range has been received.
    """
    pending = {}
    for expected_page in range(StartPage, EndPage + 1):
        # Keep receiving until the page that is due next has arrived.
        while expected_page not in pending:
            page_number, data = OrderedResults.get()
            pending[page_number] = data
        process_text(pending.pop(expected_page))

# Entry point: start the export
if __name__ == '__main__':

    # Start the output file with a line holding the export time.
    formatted_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    with open(OutputFileName, "w", encoding="utf-8") as f:
        f.write(formatted_time + "导出\n")

    # Daemon thread: if the main thread fails, the process must not be kept
    # alive by a consumer waiting for pages that will never arrive.
    ConsumerThread = threading.Thread(target=consumer, daemon=True)
    ConsumerThread.start()
    with ThreadPoolExecutor(max_workers=4) as executor:
        # executor.map() only re-raises a worker's exception when its result
        # is retrieved, so the iterator is drained to surface failures.
        for _ in executor.map(producer, range(StartPage, EndPage + 1)):
            pass
    ConsumerThread.join()