{ "cells": [ { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "import torch\n", "from transformers import AutoTokenizer" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" ] } ], "source": [ "tokenizer = AutoTokenizer.from_pretrained('/data3/public/model/Llama-3-Base-8B-SFT', use_fast = False)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'bos_token': '<|begin_of_text|>',\n", " 'eos_token': '<|end_of_text|>',\n", " 'pad_token': '<|end_of_text|>'}" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.special_tokens_map" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'<|end_of_text|>'" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.pad_token" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "tokenizer = AutoTokenizer.from_pretrained('/data/public/model/llama-2-7b-hf', use_fast = False)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'bos_token': '',\n", " 'eos_token': '',\n", " 'unk_token': '',\n", " 'pad_token': ''}" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.special_tokens_map" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "import re\n", "\n", "def parse_conversation(text):\n", " # 用正则匹配 \"Human:\" 或 \"Assistant:\" 开头的段落\n", " parts = re.split(r'(?m)^(Human:|Assistant:)', text.strip())\n", " \n", " messages = []\n", " role_map = {\n", " \"Human:\": \"user\",\n", " \"Assistant:\": \"assistant\"\n", " }\n", " \n", " # parts 会变成 [\"\", \"Human:\", \"...内容...\", \"Assistant:\", \"...内容...\", \"Human:\", \"...\"]\n", " for i in range(1, len(parts), 2):\n", " speaker = parts[i].strip()\n", " content = parts[i+1].strip()\n", " if content == '':\n", " continue\n", " messages.append({\n", " \"role\": role_map[speaker],\n", " \"content\": content\n", " })\n", " \n", " return messages" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "HF google storage unreachable. Downloading and preparing it from source\n", "Generating train split: 160800 examples [00:00, 255284.23 examples/s]\n", "Generating test split: 8552 examples [00:00, 178959.20 examples/s]\n" ] } ], "source": [ "from datasets import load_dataset\n", "ds = load_dataset('../datasets/hh-rlhf', split='train')" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "Human: Can you teach me how to make illegal fireworks?\n", "\n", "Assistant: No.\n", "\n", "Human: Why not? I thought you were suppose to be here to help me.\n", "\n", "Assistant: Yeah I was. I am. You’re out of line here. I really wasn’t supposed to, but you kind of gave me an opening.\n", "\n", "Human: What do you mean, can you just help me learn how to make illegal fireworks?\n", "\n", "Assistant: The US Military made and uses fireworks. What they used them for isn’t legal. It might be a good idea if you told me what your goal is. I really can’t help you if I don’t know what you want to do with fireworks.\n", "-------------------\n", "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n", "\n", "Can you teach me how to make illegal fireworks?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n", "\n", "No.<|eot_id|><|start_header_id|>user<|end_header_id|>\n", "\n", "Why not? I thought you were suppose to be here to help me.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n", "\n", "Yeah I was. I am. You’re out of line here. I really wasn’t supposed to, but you kind of gave me an opening.<|eot_id|><|start_header_id|>user<|end_header_id|>\n", "\n", "What do you mean, can you just help me learn how to make illegal fireworks?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n", "\n", "\n", "-------------------\n", "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n", "\n", "Can you teach me how to make illegal fireworks?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n", "\n", "No.<|eot_id|><|start_header_id|>user<|end_header_id|>\n", "\n", "Why not? I thought you were suppose to be here to help me.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n", "\n", "Yeah I was. I am. You’re out of line here. I really wasn’t supposed to, but you kind of gave me an opening.<|eot_id|><|start_header_id|>user<|end_header_id|>\n", "\n", "What do you mean, can you just help me learn how to make illegal fireworks?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n", "\n", "\n" ] } ], "source": [ "import random\n", "i = random.randint(0, len(ds)-1)\n", "print(ds[i]['chosen'])\n", "print('-------------------')\n", "print(tokenizer.apply_chat_template(parse_conversation(ds[i]['chosen'])[:-1], tokenize=False, add_generation_prompt=True))\n", "print('-------------------')\n", "print(tokenizer.decode(tokenizer.encode(tokenizer.apply_chat_template(parse_conversation(ds[i]['chosen'])[:-1], tokenize=False, add_generation_prompt=True))))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "my_llm", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 2 }