S-Dreamer committed on
Commit
1c34bfc
·
verified ·
1 Parent(s): b79ce4d

Update datasets_loader.py

Browse files
Files changed (1) hide show
  1. datasets_loader.py +5 -31
datasets_loader.py CHANGED
@@ -1,33 +1,7 @@
1
- # datasets_loader.py
2
 
3
- from datasets import load_dataset, DatasetDict
4
- from transformers import AutoTokenizer
5
 
6
- def load_threat_dataset(path: str, tokenizer_name="bert-base-chinese"):
7
- """
8
- Loads a dataset of Chinese cybercrime posts with labels.
9
- Expects columns: text, label
10
- """
11
- raw = load_dataset("csv", data_files=path)
12
-
13
- tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
14
-
15
- def tokenize(batch):
16
- return tokenizer(
17
- batch["text"],
18
- truncation=True,
19
- padding="max_length",
20
- max_length=256
21
- )
22
-
23
- tokenized = raw.map(tokenize, batched=True)
24
-
25
- return DatasetDict({
26
- "train": tokenized["train"],
27
- "test": tokenized["test"]
28
- })
29
-
30
-
31
- # Example:
32
- # ds = load_threat_dataset("dataset/threat_samples.csv")
33
- # print(ds["train"][0])
 
1
+ from datasets import load_dataset
2
 
3
+ def load_redteamer_mistral():
4
+ return load_dataset("romaingrx/red-teamer-mistral-", split="train")
5
 
6
+ def load_multi_class_redteaming():
7
+ return load_dataset("SummerSigh/Muti-Class-Redteaming", split="train")