Spaces:

aparke21
/

newtest

Runtime error

App Files Files Community

newtest / utils.py

aparke21

Upload 106 files

9014afd verified 4 months ago

raw

history blame contribute delete

7.45 kB

	import os
	import base64
	from io import BytesIO
	import yaml # type: ignore
	from PIL import Image # type: ignore
	from tqdm import tqdm # type: ignore
	import re
	import pandas as pd
	import numpy as np
	np.random.seed(42)


	"""======================================================
	Convert PIL images to Base64 encoded strings

	:param pil_image: PIL image
	:return: Base64-encoded string of the image serialized as PNG (no resizing is performed)
	======================================================"""
	def convert_to_base64(pil_image):
	    """Serialize an image as PNG and return the bytes as Base64 text.

	    :param pil_image: image object exposing ``save(fp, format=...)``
	        (a PIL image in this project)
	    :return: Base64 encoding of the PNG bytes, as a ``str``
	    """
	    with BytesIO() as png_stream:
	        # The output format can be changed here if needed.
	        pil_image.save(png_stream, format="PNG")
	        png_bytes = png_stream.getvalue()
	    encoded = base64.b64encode(png_bytes)
	    return encoded.decode("utf-8")

	"""======================================================
	Load config files
	======================================================"""
	def load_config(filepath):
	    """Load a YAML configuration file.

	    Args:
	        filepath: path of the YAML file to read.

	    Returns:
	        Whatever ``yaml.safe_load`` produces for the file content, or
	        ``None`` when the file does not exist or is not valid YAML. In both
	        failure cases an error message is printed instead of raising.
	    """
	    try:
	        # Read as UTF-8 explicitly so the result does not depend on the
	        # platform's locale encoding.
	        with open(filepath, "r", encoding="utf-8") as f:
	            return yaml.safe_load(f)
	    except FileNotFoundError:
	        print(f"Error: Configuration file '{filepath}' not found.")
	        return None
	    except yaml.YAMLError as e:
	        print(f"Error parsing YAML file '{filepath}': {e}")
	        return None



	""" ======================================================
	Convert individual-poster pair row to LLM conversation format

	Args:
	+ sample: a row of individual-poster response
	return: conversation format
	======================================================"""
	def convert_to_conversation(sample, use_image=True, stimuli_dir="../stimuli"):
	    """Convert one individual-poster response row into chat-message format.

	    Args:
	        sample: mapping-like row providing ``"instruction"`` and ``"answer"``;
	            ``"image_name"`` is additionally required when ``use_image`` is
	            true.
	        use_image: if true, build a multimodal conversation whose user turn
	            carries the instruction text plus the opened poster image;
	            otherwise build a text-only conversation.
	        stimuli_dir: directory holding the ``<image_name>.png`` posters.
	            Relative paths are resolved against the current working
	            directory.

	    Returns:
	        dict: ``{"messages": [user_turn, assistant_turn]}``. The assistant
	        text is always ``str(sample["answer"])``.
	    """
	    instruction = sample["instruction"]
	    answer_text = str(sample["answer"])

	    if not use_image:
	        # Text-only Language Model: the poster is never touched, so rows
	        # without an "image_name" are accepted here.
	        return {
	            "messages": [
	                {"role": "user", "content": instruction},
	                {"role": "assistant", "content": answer_text},
	            ]
	        }

	    # Multimodal Vision-Language Model
	    poster_id = sample["image_name"]
	    poster_path = os.path.abspath(f"{stimuli_dir}/{poster_id}.png")
	    image = Image.open(poster_path)
	    conversation = [
	        {
	            "role": "user",
	            "content": [
	                {"type": "text", "text": instruction},
	                {"type": "image", "image": image},
	            ],
	        },
	        {
	            "role": "assistant",
	            "content": [
	                {"type": "text", "text": answer_text},
	            ],
	        },
	    ]
	    return {"messages": conversation}

	"""
	Convert dataframe to SFT-compatible trainer dataset

	Args:
	+ dataframe: original dataframe
	return: SFT Trainer-compatible dataset
	"""
	def convert_to_model_trainer_dataset(dataframe, use_image=True):
	    """Convert every row of a dataframe into SFT-trainer conversation format.

	    Args:
	        dataframe: pandas DataFrame whose rows are accepted by
	            ``convert_to_conversation``.
	        use_image: forwarded to ``convert_to_conversation``.

	    Returns:
	        list: one conversation dict per row, in row order.
	    """
	    # Iterate rows positionally instead of via ``dataframe.loc[label]``:
	    # with duplicated index labels ``.loc`` returns a DataFrame of all
	    # matching rows rather than a single row.
	    row_iter = tqdm(
	        dataframe.iterrows(),
	        total=len(dataframe),
	        desc="dataset conversion",
	    )
	    return [convert_to_conversation(sample, use_image) for _, sample in row_iter]


	"""
	Process and filter persona components (demographics, personality scores, and locus)
	based on user selection flags.

	Args:
	demo_info (str): Raw demographic information text.
	persona_score (str): Text block containing personality trait information.
	locus (str): Text or variable for locus of control information.
	demo_full (bool): If False, only include selected demographic fields.
	include_big5 (bool): Whether to include Big-Five personality scores.
	include_facet (bool): Whether to include facet-level trait scores.
	include_locus (bool): Whether to include locus information.
	train_mode (bool):
	True, use training mode processing: use sensitive traits only.
	False, use evaluation mode processing: include all specified traits.

	Returns:
	tuple: (demo_info, persona_score, locus)
	"""
	def _rebuild_demographics(demo_info, train_mode):
	    """Re-render the demographics text using only the whitelisted fields."""
	    fields = [
	        "Gender", "Age", "Current Profession", "Race/Ethnicity",
	        "Religious/Cultural Group", "Political Affiliation",
	        "Highest Education", "Annual Household Income", "Family Status",
	    ]
	    # [SPH Request] Most sensitive to "Gender", "Age", and "Race/Ethnicity";
	    # the reduced subset used in training keeps these five fields.
	    sensitive_fields = [
	        "Gender", "Age", "Race/Ethnicity",
	        "Current Profession", "Annual Household Income",
	    ]

	    if train_mode:
	        # 90% of the time, only include the five sensitive fields.
	        use_top5_sensitive = bool(np.random.rand() <= 0.9)
	    else:
	        # Include all whitelisted fields during evaluation.
	        use_top5_sensitive = False

	    if demo_info is None or pd.isna(demo_info):
	        demo_info = ""
	    demo_info = demo_info.replace("Demographics:", "").replace("Demographics", "")

	    # Parse "Key: value" lines, splitting on the first colon of each line so
	    # multi-character keys such as "Race/Ethnicity" are recognised.
	    demo_dict = {}
	    for line in demo_info.splitlines():
	        key, sep, value = line.partition(":")
	        key = key.strip()
	        if sep and key in fields:
	            demo_dict[key] = value.strip()

	    rendered = ["Demographics:"]
	    for k in fields:
	        # Either no subset filtering is active, or k belongs to the subset.
	        trait_k_included = (not use_top5_sensitive) or (k in sensitive_fields)
	        # The trait must be both available and selected.
	        if k in demo_dict and trait_k_included:
	            rendered.append(f"{k}: {demo_dict[k]}")
	        else:
	            rendered.append(f"{k}: [Not specified]")
	    return "\n".join(rendered) + "\n"


	def process_trait_info(
	    demo_info, persona_score, locus,
	    demo_full=True, include_big5=True,
	    include_facet=True, include_locus=True,
	    train_mode=True,
	):
	    """Filter persona components according to the selection flags.

	    Args:
	        demo_info (str): raw demographic text made of ``Key: value`` lines;
	            ``None``/NaN is treated as empty when filtering is active.
	        persona_score (str): text that may contain a
	            ``Big-Five Trait Scores:`` section and a ``Facet Scores:``
	            section; ``None``/NaN is treated as empty.
	        locus: locus-of-control information, passed through untouched
	            unless ``include_locus`` is false.
	        demo_full (bool): when falsy (but not ``None``) the demographics are
	            rebuilt from a fixed list of fields, with ``[Not specified]``
	            for anything missing or filtered out. When truthy or ``None``,
	            ``demo_info`` is returned unchanged.
	        include_big5 (bool): when false, an existing Big-Five section is
	            removed from ``persona_score``.
	        include_facet (bool): when false, the facet section is removed.
	        include_locus (bool): when false, ``locus`` is returned as ``None``.
	        train_mode (bool): only used when demographics are rebuilt. If true,
	            one ``np.random.rand()`` draw decides (probability 0.9) whether
	            only the five sensitive fields are kept; if false, every listed
	            field is kept.

	    Returns:
	        tuple: ``(demo_info, persona_score, locus)``.
	    """
	    # ==============================
	    # 1. Demographic filtering
	    # ==============================
	    if not demo_full and (demo_full is not None):
	        demo_info = _rebuild_demographics(demo_info, train_mode)

	    # ==============================
	    # 2. Personality trait filtering
	    # ==============================
	    persona_missing = (persona_score is None) or pd.isna(persona_score)
	    if not include_big5 and not persona_missing:
	        # Drop the Big-Five section up to the next section header or the end.
	        persona_score = re.sub(
	            r"Big-Five Trait Scores:.*?(?=Facet Scores:|Demographics:|$)",
	            "",
	            persona_score,
	            flags=re.DOTALL,
	        )
	    else:
	        # Reached when Big-Five is included, and also when persona_score is
	        # missing: in both cases a placeholder is added if no section exists.
	        if persona_missing:
	            persona_score = ""
	        if "Big-Five Trait Scores" not in persona_score:
	            persona_score = "\nBig-Five Trait Scores: [Not specified]\n" + persona_score

	    # persona_score is a string from here on.
	    if not include_facet:
	        # Drop the facet section up to the next section header or the end.
	        persona_score = re.sub(
	            r"Facet Scores:.*?(?=Demographics:|$)",
	            "",
	            persona_score,
	            flags=re.DOTALL,
	        )
	    elif "Facet Scores" not in persona_score:
	        persona_score = persona_score + "\nFacet Scores: [Not specified]\n"

	    # ==============================
	    # 3. Locus inclusion
	    # ==============================
	    if not include_locus:
	        locus = None

	    return demo_info, persona_score, locus