import json, random, re
import multiprocessing, math
import os
import hashlib
from pathlib import Path
import emoji
from datetime import datetime, timedelta
import tiktoken
import shutil


def truncate_list_by_token_count(text_list, max_tokens=128 * 1024, model_name="gpt-3.5-turbo"):
    """
    Keep the longest leading run of ``text_list`` whose total token count fits in ``max_tokens``.

    Texts are taken in order; the first text that would push the running total
    above ``max_tokens`` ends the scan, and neither it nor anything after it is kept.

    Args:
        text_list (List[str]): Texts to process, in priority order.
        max_tokens (int): Token budget, 128k (131072) by default.
        model_name (str): Model name used to select the tiktoken encoding.

    Returns:
        List[str]: The retained leading texts (a new list).
    """
    try:
        encoding = tiktoken.encoding_for_model(model_name)
    except KeyError:
        # tiktoken raises KeyError for model names it has no mapping for.
        # The count is only used for budgeting, so fall back to a general-purpose
        # encoding instead of aborting the whole run.
        encoding = tiktoken.get_encoding("cl100k_base")

    kept = []
    used_tokens = 0
    for text in text_list:
        used_tokens += len(encoding.encode(text))
        if used_tokens > max_tokens:
            break
        kept.append(text)
    return kept


def extract_emojis_list(text):
    """Return, in order, every character of ``text`` that is a key of ``emoji.EMOJI_DATA``.

    The check is done one character at a time, so repeated emoji appear
    repeatedly in the result.
    """
    known_emojis = emoji.EMOJI_DATA
    found = []
    for symbol in text:
        if symbol in known_emojis:
            found.append(symbol)
    return found


def is_ad_text(text):
    """Heuristically decide whether ``text`` looks like a broadcast/advertising message.

    The decision combines three signals measured on the stripped text:
    the number of emoji characters, the number of escaped line breaks
    (the literal two-character sequence backslash + ``n``) and the number of
    escaped blank lines (that sequence twice in a row), plus a list of
    marketing phrases.

    Returns:
        bool: True when any rule fires, False otherwise.
    """
    body = text.strip()
    emoji_total = len(extract_emojis_list(body))
    # NOTE: these count the *escaped* form "\n" (backslash followed by n),
    # not real newline characters.
    blank_line_total = body.count('\\n\\n')
    line_break_total = body.count('\\n')

    if emoji_total >= 3:
        return True

    layout_rules = (
        emoji_total >= 2 and blank_line_total >= 1,
        emoji_total >= 1 and blank_line_total >= 1 and line_break_total >= 1,
        emoji_total >= 1 and blank_line_total >= 2,
        emoji_total >= 1 and line_break_total >= 2,
        line_break_total >= 2 and blank_line_total >= 2,
    )
    if any(layout_rules):
        return True

    # Fixed marketing phrases that mark a message as promotional on their own.
    return re.search(r'亲爱的家长朋友|尊敬的银河|隆重推出|好礼相送|推荐奖金|银河精选要闻', body) is not None


def get_md5_value(string: str):
    """Return the hexadecimal MD5 digest of ``string`` encoded as UTF-8."""
    return hashlib.md5(string.encode('utf-8')).hexdigest()


# s = get_md5_value(string="https://prod-cdn-pub.galaxy-immi.com/production/ultron/ultron_phone/audio/3817385431338117814120250329160004.mp3")
# print(s)

# Placeholder text written instead of the payload of non-text messages.
_MEDIA_PLACEHOLDERS = {
    'image': '相关图片',
    'video': '相关视频',
    'file': '相关文件',
    'link': '相关链接',
    'emotion': '相关表情',
}
# Message types whose content is an audio reference with an optional transcript file.
_VOICE_MSG_TYPES = ('voice', 'meeting_voice_call', 'phone')
_MSG_TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
_DEFAULT_DEPART_NAME = '销售组'


def _has_value(value):
    """Return True when ``value`` is neither None nor the empty string."""
    return value not in (None, '')


def _message_content(dialog_dict):
    """Return the printable content of one message (placeholder for media, escaped text otherwise)."""
    placeholder = _MEDIA_PLACEHOLDERS.get(dialog_dict['msg_type'])
    if placeholder is not None:
        return placeholder
    # repr() escapes line breaks so that every message stays on a single output line;
    # the surrounding quotes added by repr() are stripped again.
    return repr(dialog_dict['content']).strip("'").strip('"')


def _voice_lines(dialog_dict, content, customer_id, audio_dir, depart_name):
    """Return the output lines for one voice/phone message.

    The transcript is looked up as ``audio_<md5 of content>.json`` inside ``audio_dir``.
    ``depart_name`` is the department used when the message itself carries none.
    """
    transcript_path = os.path.join(audio_dir, f'audio_{get_md5_value(content)}.json')
    own_depart = dialog_dict['beisen_depart_name']

    if not os.path.isfile(transcript_path):
        # No transcript available: emit a single placeholder line for the call.
        print(f'客户({customer_id})：音频文件不存在:{transcript_path} 内容：{content}')
        depart = own_depart if _has_value(own_depart) else _DEFAULT_DEPART_NAME
        return [
            f"{dialog_dict['msg_time']} {depart}-客户({customer_id})-{dialog_dict['msg_category']}：相关通话"
        ]

    with open(transcript_path, 'r', encoding='utf-8') as transcript_file:
        raw_transcript = transcript_file.read()
    try:
        transcript = json.loads(raw_transcript)
    except ValueError:
        # An unparsable transcript is treated like an empty one.
        transcript = None

    sender = dialog_dict['msg_sender_type_id']
    if transcript is None or sender not in (0, 1, 2):
        return []

    # Sender 0 and every meeting call are labelled as a two-party call;
    # otherwise sender 1 is the customer and sender 2 is the sales side.
    is_two_party_call = sender == 0 or dialog_dict['msg_type'] == 'meeting_voice_call'
    base_time = datetime.strptime(dialog_dict['msg_time'], _MSG_TIME_FORMAT)

    lines = []
    for sentence in transcript['Result']['Sentences']:
        # BeginTime is applied as a millisecond offset from the message time.
        spoken_at = (base_time + timedelta(milliseconds=sentence['BeginTime'])).strftime(_MSG_TIME_FORMAT)
        if is_two_party_call:
            speaker = f"{depart_name}与客户({customer_id})-语音通话"
        elif sender == 1:
            speaker = f"客户({customer_id})-{dialog_dict['msg_category']}"
        else:
            depart = own_depart if _has_value(own_depart) else depart_name
            speaker = f"{depart}-{dialog_dict['msg_category']}"
        lines.append(f"{spoken_at} {speaker}：{sentence['Text']}")
    return lines


def _text_line(dialog_dict, content, customer_id):
    """Return the output line for a non-voice message, or None for senders other than 1 (customer) / 2 (sales)."""
    sender = dialog_dict['msg_sender_type_id']
    if sender == 1:
        return f"{dialog_dict['msg_time']} 客户({customer_id})-{dialog_dict['msg_category']}：{content}"
    if sender == 2:
        own_depart = dialog_dict['beisen_depart_name']
        depart = own_depart if _has_value(own_depart) else _DEFAULT_DEPART_NAME
        return f"{dialog_dict['msg_time']} {depart}-{dialog_dict['msg_category']}：{content}"
    return None


def _format_customer_dialog(dialog_dict_list, customer_id, audio_dir):
    """Turn one customer's message records into chronologically ordered output lines."""
    ordered = sorted(dialog_dict_list,
                     key=lambda item: datetime.strptime(item['msg_time'], _MSG_TIME_FORMAT))
    # Voice records carry no agent name, so transcripts use the most recently seen
    # department; before any has been seen, the first one found in the whole
    # dialog (or the generic default) is used.
    voice_depart_name = next(
        (item['beisen_depart_name'] for item in ordered if _has_value(item['beisen_depart_name'])),
        _DEFAULT_DEPART_NAME,
    )

    result_list = []
    for dialog_dict in ordered:
        if _has_value(dialog_dict['beisen_depart_name']):
            voice_depart_name = dialog_dict['beisen_depart_name']

        content = _message_content(dialog_dict)
        if content == "None":
            # The record had no content at all.
            continue

        if dialog_dict['msg_type'] in _VOICE_MSG_TYPES:
            result_list.extend(
                _voice_lines(dialog_dict, content, customer_id, audio_dir, voice_depart_name))
        else:
            line = _text_line(dialog_dict, content, customer_id)
            if line is not None:
                result_list.append(line)
    return result_list


def saler_customer_dialog_format(input_sub_list, output_file, lock, ):
    """Format a batch of customer dialog files and append them to ``output_file``.

    Args:
        input_sub_list: Iterable of ``(index, customer_path_file)`` pairs. Each file is a
            JSON list of message dicts; the customer id is taken from the file name
            (``customer_id_<id>.json``).
        output_file: Path of the shared text file the formatted groups are appended to.
        lock: Lock guarding ``output_file``; it must support the ``with`` statement
            (``multiprocessing.Lock`` and ``threading.Lock`` both do).

    For every non-empty customer file a ``对话组：<index>`` header followed by one line
    per message is appended. Files containing no messages are skipped, and the
    remaining files of the batch are still processed.
    """
    for index, customer_path_file in input_sub_list:
        file_name = os.path.basename(customer_path_file)
        customer_id = file_name.replace('customer_id_', '').replace('.json', '')

        with open(customer_path_file, 'r', encoding='utf-8') as customer_file:
            dialog_dict_list = json.loads(customer_file.read())
        if not dialog_dict_list:
            # Nothing to write for this customer; keep going with the rest of the batch.
            continue

        result_list = _format_customer_dialog(
            dialog_dict_list, customer_id, os.path.dirname(customer_path_file))

        # Several worker processes append to the same file, so writes are serialized.
        with lock:
            with open(output_file, 'a', encoding='utf-8') as f:
                f.write(f'对话组：{index}' + "\n")
                print(f'对话组：{index}', customer_path_file)
                f.write('\n'.join(result_list) + "\n")


def multi_saler_customer_format(input_list, output_file, group_index=0, group_size=5, num_processes=5):
    """Format all customer files in parallel, ``num_processes`` worker processes at a time.

    Args:
        input_list: List of ``(index, customer_path_file)`` pairs.
        output_file: Result file; it is emptied first, then appended to by the workers.
        group_index: Index of the first group to process.
        group_size: Number of input entries handed to one worker process.
        num_processes: Maximum number of worker processes started per batch.

    Each batch is started, then joined completely before the next batch begins.
    """
    # Start from an empty result file; the workers only ever append.
    with open(output_file, 'w', encoding='utf-8'):
        pass

    # One lock shared by all workers to serialize writes to the result file.
    lock = multiprocessing.Lock()
    total_items = len(input_list)
    num_groups = math.ceil(total_items / group_size)
    print(f"总共划分的组数为: {num_groups}")

    while group_index < num_groups:
        # Never start more workers than there are groups left.
        batch_size = min(num_processes, num_groups - group_index)
        print("当前第%s分组" % group_index, f"总共划分的组数为: {num_groups}")

        running = []
        for _ in range(batch_size):
            start_index = group_index * group_size
            end_index = min(start_index + group_size, total_items)
            print(f"当前分组的索引范围: {start_index} - {end_index}")
            chunk = input_list[start_index:end_index]
            group_index += 1
            if len(chunk) == 0:
                continue
            worker = multiprocessing.Process(target=saler_customer_dialog_format,
                                             args=(chunk, output_file, lock,))
            running.append(worker)
            worker.start()

        # Wait for the whole batch; its process objects are dropped afterwards.
        for worker in running:
            worker.join()


def get_dialog_to_load(parentdir):
    """Step 1 - clean the raw data: format every customer file into one text file per directory.

    Every sub-directory of ``parentdir`` is processed on its own; when there are
    no sub-directories, ``parentdir`` itself is processed. The result is written
    to ``<dir>/<dir name>.txt``.
    """
    folders = [os.path.join(parentdir, entry.name)
               for entry in Path(parentdir).iterdir() if entry.is_dir()]

    for folder in folders or [parentdir]:
        result_file = os.path.join(folder, Path(folder).name + '.txt')
        # Only files whose name mentions "customer_id" are dialog files;
        # they are numbered from 1 in directory-listing order.
        customer_files = (os.path.join(folder, name)
                          for name in os.listdir(folder)
                          if re.search(r'customer_id', name))
        numbered_files = list(enumerate(customer_files, start=1))
        multi_saler_customer_format(numbered_files, result_file,
                                    group_index=0, group_size=5, num_processes=5)


def load_dialog_to_merge(data_parent_dir, data_file_dt, start_dt='', end_dt=''):
    """Step 2 - merge the per-directory dialog files into one filtered source file.

    Args:
        data_parent_dir: Directory holding either sub-directories (each with a
            ``<sub>/<sub>.txt`` dialog file) or, when it has no sub-directories,
            a single ``<name>.txt`` dialog file.
        data_file_dt: Name of the target folder under ``dialog_source_data/``;
            an empty string means today's date as ``YYYYMMDD``.
        start_dt: Inclusive lower date bound ``YYYY-MM-DD``; empty means 2000-01-01.
        end_dt: Inclusive upper date bound ``YYYY-MM-DD``; empty means 2999-01-01.

    Raises:
        ValueError: If ``start_dt`` or ``end_dt`` is non-empty and not ``YYYY-MM-DD``.

    Processing steps:
        1. Keep group header lines and dialog lines whose date lies in the range.
        2. Renumber the groups consecutively; within each group drop every day
           that contains no customer line (``客户(<digits>)``).
        3. Cut every group to a 100k-token budget and drop groups that keep
           10 or fewer entries (header included).
        4. Write the result and move it to ``dialog_source_data/<dt>/source.md``
           (relative to the current working directory).

    Dialog lines that appear before the first group header belong to no group
    and are skipped.
    """
    group_marker = '对话组'
    date_format = "%Y-%m-%d"

    output_file = os.path.join(data_parent_dir, Path(data_parent_dir).name + '.txt')
    sub_dir_names = [entry.name for entry in Path(data_parent_dir).iterdir() if entry.is_dir()]
    if sub_dir_names:
        source_files = [os.path.join(data_parent_dir, name, name + '.txt') for name in sub_dir_names]
    else:
        source_files = [output_file]
    print('整合的子文件有：\n', '\n'.join(source_files))

    # The bounds do not depend on the line being read, so parse them once.
    start_date = datetime.strptime(start_dt if start_dt != '' else '2000-01-01', date_format)
    end_date = datetime.strptime(end_dt if end_dt != '' else '2999-01-01', date_format)

    # 1. Collect headers plus the dialog lines inside the date range.
    kept_lines = []
    for file in source_files:
        with open(file, 'r', encoding='utf-8') as infile:
            for line in infile:
                # Headers are recognized at the start of the line only, so that a chat
                # message merely mentioning the marker is not mistaken for a header.
                if line.startswith(group_marker):
                    kept_lines.append(line)
                    continue
                try:
                    log_date = datetime.strptime(line.strip()[:10], date_format)
                except ValueError as e:
                    # Lines that do not start with a date are reported and dropped.
                    print(f'err content:{line}', e)
                    continue
                if start_date <= log_date <= end_date:
                    kept_lines.append(line)
        print('已合并数据文件:', file)

    # 2. Bucket the lines per (day, group) and drop days without any customer reply.
    day_buckets = {}
    current_header = None
    group_index = 1
    for line in kept_lines:
        if line.startswith(group_marker):
            current_header = f'对话组：{group_index}\n'
            day_buckets[current_header] = [current_header]
            group_index += 1
        elif current_header is not None:
            day_buckets.setdefault(line.strip()[:10] + current_header, []).append(line)

    customer_pattern = re.compile(r"客户[(]\d+[)]")
    customerless_days = [key for key, lines in day_buckets.items()
                         if key.startswith('20') and not customer_pattern.search(''.join(lines))]
    for key in customerless_days:
        del day_buckets[key]

    # 3. Regroup by header, enforce the token budget and drop groups that are too small.
    groups = {}
    current_header = None
    for lines in day_buckets.values():
        for line in lines:
            if line.startswith(group_marker):
                current_header = line
                # The header entry also carries the first round marker.
                groups[current_header] = [line + f'round1:{end_dt}\n']
            elif current_header in groups:
                groups[current_header].append(line)

    for header in list(groups):
        trimmed = truncate_list_by_token_count(groups[header], max_tokens=100 * 1024,
                                               model_name="gpt-3.5-turbo")
        if len(trimmed) <= 10:
            del groups[header]
        else:
            groups[header] = trimmed

    # Prefix bare date lines (exactly 10 characters starting with "20") with a
    # per-group round counter that restarts at every group header.
    output_lines = []
    day_index = 1
    for lines in groups.values():
        for line in lines:
            stripped = line.strip()
            if len(stripped) == 10 and stripped.startswith('20'):
                line = f'round{day_index}:{line}'
                day_index += 1
            if line.startswith(group_marker):
                day_index = 1
            output_lines.append(line)

    # 4. Save locally, then move the file into the target folder.
    range_suffix = '' if start_dt == '' and end_dt == '' else f'({start_dt}_{end_dt})'
    # output_file always ends with '.txt'; insert the range suffix before it.
    load_path_file = output_file[:-len('.txt')] + range_suffix + '.txt'
    with open(load_path_file, 'w', encoding='utf-8') as outfile:
        outfile.writelines(output_lines)
        print('合并后输出数据文件：', load_path_file)

    # The folder name is computed outside the f-string: reusing the same quote
    # character inside an f-string is only valid from Python 3.12 on.
    folder_name = datetime.today().strftime('%Y%m%d') if data_file_dt == '' else data_file_dt
    target_dir = f'dialog_source_data/{folder_name}'
    os.makedirs(target_dir, exist_ok=True)
    target_file = os.path.join(target_dir, 'source.md')
    shutil.move(load_path_file, target_file)
    print('目标数据文件转入到：', target_file)


def extract_dialogue(customer_id, text, index_str='对话组：'):
    """Return the first dialog group in ``text`` that contains ``customer_id``.

    A group starts at ``index_str`` and runs up to the next ``index_str`` or the
    end of the text. ``index_str`` is inserted into the regular expression as-is.
    ``customer_id`` is matched as a plain substring of the group.

    Returns:
        str | None: The matching group text, or None when no group contains the id.
    """
    group_pattern = re.compile(rf'{index_str}.*?(?={index_str}|$)', re.DOTALL)
    groups = (hit.group() for hit in group_pattern.finditer(text))
    return next((group for group in groups if customer_id in group), None)


def get_customer_id_dialog(customer_id, input_file):
    """Print the dialog group of ``customer_id`` found in ``input_file`` (prints None if absent)."""
    with open(input_file, 'r', encoding='utf-8') as infile:
        merged_text = infile.read()
    print(extract_dialogue(customer_id=customer_id, text=merged_text))


if __name__ == '__main__':
    # Earlier pipeline steps, kept for reference (step 1: clean, step 2: merge):
    # data_parent_dir = r'D:\Downloads\对客销售\对话语料\20250612-119未复购客户'
    # get_dialog_to_load(data_parent_dir)
    # load_dialog_to_merge(data_parent_dir, start_dt=f'{(datetime.today() - timedelta(days=60)).strftime('%Y-%m-%d')}', end_dt=f'{(datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d')}')

    # Look up one customer's dialog group in an already merged source file.
    get_customer_id_dialog('914108', './dialog_source_data/20250626/source.md')
