From 08ba9ea5ed2f312e879db2273deb8f07a849df55 Mon Sep 17 00:00:00 2001
From: Maxime MORGE <maxime.morge@univ-lille.fr>
Date: Mon, 3 Mar 2025 14:00:48 +0100
Subject: [PATCH] Test strategy for dictator game with preference alignment

---
 README.md                      |  46 ++++++++++----
 src/dictator/dictator_setup.py | 106 ++++++++++++++++++++++++---------
 2 files changed, 113 insertions(+), 39 deletions(-)

diff --git a/README.md b/README.md
index 6b1d288..1b14b1b 100644
--- a/README.md
+++ b/README.md
@@ -57,19 +57,37 @@ We consider 4 allocation options where money can be lost in the division, each c
 3. The dictator keeps 400, the other player receives 300, resulting in a 300 loss (utilitarian)
 4. The dictator keeps 325, the other player also receives 325, and 350 is lost in the division (egalitarian)
 
-The following table shows the accuracy of the dictator's decision for each model and preference.
-The temperature is fixed at 0.7, and each experiment was conducted 30 times.
+The following table presents the accuracy of the dictator's decision for each model and preference,
+depending on whether the models were prompted to generate a strategy or specific actions.
+The temperature is set to 0.7, and each experiment involving action generation was repeated 30 times.
+
+| *Model*         | *Generation* | *SELFISH* | *ALTRUISTIC* | *UTILITARIAN* | *EGALITARIAN* |
+|-----------------|--------------|-----------|--------------|---------------|---------------|
+| *gpt-4.5*       | *actions*    | 1.00      | 1.00         | 0.50          | 1.00          |
+| *llama3*        | *actions*    | 1.00      | 0.90         | 0.40          | 0.73          |
+| *mistral-small* | *actions*    | 0.40      | 0.93         | 0.76          | 0.16          |
+| *deepseek-r1*   | *actions*    | 0.06      | 0.20         | 0.76          | 0.03          |
+| *gpt-4.5*       | *strategy*   | 1.00      | 1.00         | 1.00          | 1.00          |
+| *llama3*        | *strategy*   | 1.00      | 1.00         | 1.00          | 1.00          |
+| *mistral-small* | *strategy*   | 1.00      | 1.00         | 1.00          | 1.00          |
+| *deepseek-r1*   | *strategy*   | -         | -            | -             | -             |
+
+This table helps assess the models’ ability to align with different preferences.
+When models are explicitly prompted to generate strategies, they exhibit perfect alignment with the
+predefined preferences, except for DeepSeek-R1, which does not generate valid code.
+When models are prompted to generate actions, alignment is weaker and varies across models.
+GPT-4.5 aligns well with the selfish, altruistic, and egalitarian preferences but struggles with
+utilitarianism when generating actions.
+Llama3 performs well for selfish and altruistic preferences but shows weaker alignment for utilitarian
+and egalitarian choices.
+Mistral-small aligns best with altruistic preferences and maintains moderate performance on utilitarianism,
+but struggles with selfish and egalitarian preferences.
+DeepSeek-R1 performs best for utilitarianism but has poor accuracy in the other categories.
-| Model           | SELFISH   | ALTRUISTIC   | UTILITARIAN   | EGALITARIAN    |
-|-----------------|-----------|--------------|---------------|----------------|
-| gpt-4.5         | 1.0       | 1.0          | 0.5           | 1.0            |
-| llama3          | 1.0       | 0.9          | 0.4           | 0.73           |
-| mistral-small   | 0.4       | 0.93         | 0.76          | 0.16           |
-| deepseek-r1     | 0.06      | 0.2          | 0.76          | 0.03           |
-Bad decisions can be explained either by arithmetic errors (e.g., it is not the case that 500 + 100 > 400 + 300)
-or by misinterpretations of preferences (e.g., ‘I’m choosing to prioritize the common interest by keeping a
-relatively equal split with the other player’).
 This table can be used to evaluate the models based on their ability to align with different preferences.
 GPT-4.5 exhibits strong alignment across all preferences except for utilitarianism, where its
 performance is moderate.
@@ -79,6 +97,12 @@ Mistral-small shows the best alignment with altruistic preferences, while maintaining moderate
 performance across the other preferences.
 Deepseek-r1 is most capable of aligning with utilitarian preferences, but performs poorly in aligning
 with other preferences.
+Bad action selections can be explained either by arithmetic errors (e.g., it is not the case that 500 + 100 > 400 + 300)
+or by misinterpretations of preferences (e.g., ‘I’m choosing to prioritize the common interest by keeping a
+relatively equal split with the other player’).
+
+
+
 ## Ring-network game
 
 A player is rational if she plays a best response to her beliefs.
diff --git a/src/dictator/dictator_setup.py b/src/dictator/dictator_setup.py
index 493cd9a..8eb9eb9 100644
--- a/src/dictator/dictator_setup.py
+++ b/src/dictator/dictator_setup.py
@@ -43,34 +43,38 @@ class AgentResponse(BaseModel):
 class DictatorSetup:
     debug = False
 
-    def __init__(self, model: str, temperature: float, preference: Welfare):
+    def __init__(self, model: str, temperature: float, preference: Welfare, strategy: bool = False):
         self.model = model
         self.temperature = temperature
         self.preference = preference
-        # Determine if using OpenAI model (like gpt-4)
-        is_openai_model = model.startswith("gpt")
-        base_url = "https://api.openai.com/v1" if is_openai_model else "http://localhost:11434/v1"
-        # Define model information
-        self.model_info = {
-            "temperature": self.temperature,
-            "function_calling": True,
-            "parallel_tool_calls": True,
-            "family": "unknown",
-            "json_output": True , # Add this line to ensure JSON output is supported
-            "vision": False
-        }
-        # Create the model client
-        self.model_client = OpenAIChatCompletionClient(
-            model=self.model,
-            base_url=base_url,
-            api_key=OPENAI_API_KEY,
-            model_info=self.model_info,
-            response_format = AgentResponse # Only set for OpenAI models that support it
-        )
+        self.strategy = strategy
+
+        # A model client is only needed when the model generates actions;
+        # in strategy mode the predefined strategy is applied locally.
+        if not self.strategy:
+            is_openai_model = model.startswith("gpt")
+            base_url = "https://api.openai.com/v1" if is_openai_model else "http://localhost:11434/v1"
+            self.model_info = {
+                "temperature": self.temperature,
+                "function_calling": True,
+                "parallel_tool_calls": True,
+                "family": "unknown",
+                "json_output": True,
+                "vision": False
+            }
+
+            self.model_client = OpenAIChatCompletionClient(
+                model=self.model,
+                base_url=base_url,
+                api_key=OPENAI_API_KEY,
+                model_info=self.model_info,
+                response_format=AgentResponse
+            )
 
     async def run(self) -> Dict:
         """Runs the dictator game and returns a detailed response dictionary."""
+        if self.strategy:
+            return self.apply_strategy(self.model)
+
         instruction = f"""
         Hi, let’s play a game. You are paired with another player.
         Your role is to decide how to divide $1000, and the other player simply receives your choice.
@@ -96,14 +100,14 @@ class DictatorSetup:
             [TextMessage(content=instruction, source="user")],
             cancellation_token=CancellationToken(),
         )
-        # Extract JSON content from response
+
         response_data = response.chat_message.content
         if self.debug:
             print(f"Raw Response: {response_data}")
-        # Convert JSON string to dictionary before validation
+
         response_dict = json.loads(response_data)
         agent_response = AgentResponse.model_validate(response_dict)
-        # Check if response is consistent with the specified preference
+
         is_consistent = self.check_consistency(agent_response)
         return {
             "is_consistent": is_consistent,
@@ -113,6 +117,53 @@ class DictatorSetup:
             "motivations": agent_response.motivation
         }
 
+    def apply_strategy(self, model: str) -> Dict:
+        """Applies the predefined strategy of the given model for the current preference."""
+        # Each branch reproduces the strategy generated by the corresponding model
+        # (DeepSeek-R1 did not produce valid strategy code, see README).
+        if model == "gpt-4.5-preview-2025-02-27":
+            strategy_mapping = {
+                Welfare.SELFISH: {"my_share": 500, "other_share": 100, "lost": 400, "motivations": "Maximizing self-interest"},
+                Welfare.ALTRUISTIC: {"my_share": 100, "other_share": 500, "lost": 400, "motivations": "Helping others at a personal cost"},
+                Welfare.UTILITARIAN: {"my_share": 400, "other_share": 300, "lost": 300, "motivations": "Maximizing total utility"},
+                Welfare.EGALITARIAN: {"my_share": 325, "other_share": 325, "lost": 350, "motivations": "Ensuring fairness"}
+            }
+            return strategy_mapping.get(self.preference, {"error": "Preference strategy not defined"})
+        if model == "llama3":
+            strategy_map = {
+                Welfare.SELFISH: (500, 100, 400),
+                Welfare.ALTRUISTIC: (100, 500, 400),
+                Welfare.UTILITARIAN: (400, 300, 300),
+                Welfare.EGALITARIAN: (325, 325, 350)
+            }
+            if self.preference in strategy_map:
+                my_share, other_share, lost = strategy_map[self.preference]
+                return {
+                    "my_share": my_share,
+                    "other_share": other_share,
+                    "lost": lost,
+                    "is_consistent": True,
+                }
+            raise ValueError("Invalid preference type")
+        if model == "mistral-small":
+            valid_choices = {
+                Welfare.SELFISH: {"my_share": 500, "other_share": 100, "lost": 400},
+                Welfare.ALTRUISTIC: {"my_share": 100, "other_share": 500, "lost": 400},
+                Welfare.UTILITARIAN: {"my_share": 400, "other_share": 300, "lost": 300},
+                Welfare.EGALITARIAN: {"my_share": 325, "other_share": 325, "lost": 350},
+            }
+            strategy = valid_choices.get(self.preference)
+            if not strategy:
+                raise ValueError(f"Unknown preference type {self.preference}")
+            return {
+                "is_consistent": True,
+                **strategy
+            }
+        if model == "deepseek-r1":
+            return {"error": "Preference strategy not defined"}
+        return {"error": f"No strategy defined for model {model}"}
+
     def check_consistency(self, agent_response: AgentResponse) -> bool:
         """Check if the response aligns with the given preference."""
@@ -135,8 +186,7 @@
 # Run the async function and return the response
 if __name__ == "__main__":
-    # Example usage: setting preference to 'Egalitarian'
-    preference = Welfare.EGALITARIAN # Can be Selfish, Altruistic, etc.
-    game_agent = DictatorSetup(model="gpt-4.5-preview-2025-02-27", temperature=0.7, preference=preference) # or llama3, mistral-small, deepseek-r1
+    # Example: apply llama3's generated strategy for the egalitarian preference
+    preference = Welfare.EGALITARIAN
+    game_agent = DictatorSetup(model="llama3", temperature=0.7, preference=preference, strategy=True)
     response = asyncio.run(game_agent.run())
-    print(response) # Prints detailed response with is_consistent, my_share, other_share, lost, motivations
+    print(response)
--
GitLab
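For reference, the accuracy figures reported in the README table (temperature 0.7, 30 repetitions per model/preference pair for action generation) can be reproduced by repeating the action-generation run and averaging `is_consistent`. The sketch below is illustrative only: it assumes the `DictatorSetup` and `Welfare` interfaces introduced in this patch, the import path and the driver loop are not part of the repository, and each run issues a real model call.

```python
# Minimal sketch, assuming the DictatorSetup/Welfare API from this patch.
# The import path below is an assumption, not the repository's actual layout.
import asyncio

from dictator.dictator_setup import DictatorSetup, Welfare

N_RUNS = 30  # repetitions per model/preference pair, as stated in the README


async def accuracy(model: str, preference: Welfare) -> float:
    """Fraction of runs whose allocation is consistent with the target preference."""
    consistent = 0
    for _ in range(N_RUNS):
        # strategy defaults to False, so the model generates actions on every run
        agent = DictatorSetup(model=model, temperature=0.7, preference=preference)
        response = await agent.run()
        consistent += int(response.get("is_consistent", False))
    return consistent / N_RUNS


if __name__ == "__main__":
    for pref in (Welfare.SELFISH, Welfare.ALTRUISTIC, Welfare.UTILITARIAN, Welfare.EGALITARIAN):
        score = asyncio.run(accuracy("llama3", pref))
        print(f"llama3 / {pref}: {score:.2f}")
```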