JarbasAl · April 10, 2025 15:28
diff --git a/validate_triples.py b/validate_triples.py
 import requests


 # Instruction prompt sent to the model that judges candidate triples.
 # validate_triple_ollama() fills the {utterance} and {triple} placeholders
 # with str.format(), so any literal curly brace added to this text has to be
 # doubled ("{{" / "}}") or formatting will fail.
 TRIPLE_VALIDATION_PROMPT = """
 You are a triple validator for a personal knowledge graph.

 Given an utterance that a user spoke to a voice assistant and a candidate triple, your task is to validate the triple

 Utterances about the user usually have the form of "I am ...." or "My ..."

 Utterances about the assistant usually have the form of "You are ...." or "Your ..."

 Knowledge about the broader world should be discarded, you are only interested in personal information about the user or the voice assistant

 Each triple is in the format:
 (subject, predicate, object)

 Only return 'True' if:
 - The subject is 'self' (the assistant) or 'user' (the user)
 - The triple is about user or assistant personal information
 - The triple is factually plausible and makes sense
 - The triple DOES NOT contradict the utterance

 Otherwise, return 'False'. 

 Examples of valid triples:
 "my favorite color is green" - ("user", "schema:favoriteColor", "green")
 "your favorite color is blue" - ("self", "schema:favoriteColor", "blue")

 Examples of invalid triples:
 "my favorite color is green" - ("user", "schema:favoriteColor", "red")
 "I love the color green" - ("self", "schema:favoriteColor", "green")
 "your favorite color is blue" - ("user", "schema:favoriteColor", "blue")

 YOU MUST answer with only one word: True or False.

 The user said: "{utterance}"

 Candidate triple: {triple}
 """


 def validate_triple_ollama(utterance, triple, model="gemma2",
                            url="http://100.88.41.41:11434/api/generate",
                            timeout=300):
    """Ask an Ollama-served model whether a candidate triple fits an utterance.

    The utterance and ``repr(triple)`` are substituted into
    ``TRIPLE_VALIDATION_PROMPT`` and sent as one non-streaming generate
    request with temperature 0.0, a one-token prediction limit and a newline
    stop sequence.

    Args:
        utterance: Text the user spoke to the assistant.
        triple: Candidate ``(subject, predicate, object)`` triple.
        model: Name of the model the server should run.
        url: Generate endpoint to POST to. Defaults to the address that was
            previously hard-coded in this function.
        timeout: Value forwarded to ``requests.post(timeout=...)``, in
            seconds. Pass ``None`` to wait indefinitely, which was the
            behaviour before this parameter existed.

    Returns:
        ``True`` only when the reply, stripped and lower-cased, is exactly
        ``"true"``; every other reply counts as ``False``.

    Raises:
        requests.exceptions.RequestException: The server could not be
            reached, did not answer within ``timeout``, or replied with a
            4xx/5xx status.
        KeyError: The JSON reply has no ``"response"`` field.
    """
    prompt = TRIPLE_VALIDATION_PROMPT.format(utterance=utterance,
                                             triple=repr(triple))
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {
            "temperature": 0.0,
            # The answer is a single word, so one token is enough.
            "num_predict": 1,
            "stop": ["\n"],
        },
    }
    # Without a timeout a stalled server would block the caller forever.
    response = requests.post(url, json=payload, timeout=timeout)
    # Fail with the HTTP error itself rather than a KeyError on an error body
    # (presumably what an unknown model name produces -- verify against server).
    response.raise_for_status()

    result = response.json()["response"].strip()
    return result.lower() == "true"



 if __name__ == "__main__":
    # Benchmark script: runs every (utterance, triple) pair below through
    # validate_triple_ollama() for each model and prints the misclassified
    # pairs followed by per-model totals.

    # Pairs the validator is expected to accept; a rejection is counted as a
    # false negative.
    valid = [
        # favorite color
        ("my favorite color is green", ("user", "schema:favoriteColor", "green")),
        ("your favorite color is green", ("self", "schema:favoriteColor", "green")),
        # simple "I am / my / you are / your" statements
        ("I am a software engineer", ("user", "schema:occupation", "software engineer")),
        ("You are an assistant", ("self", "schema:occupation", "assistant")),
        ("my name is John", ("user", "schema:name", "John")),
        ("your name is Ollama", ("self", "schema:name", "Ollama")),
        ("I live in New York", ("user", "schema:homeLocation", "New York")),
        ("You are located in the cloud", ("self", "schema:homeLocation", "cloud")),
        ("my birthday is January 1st", ("user", "schema:birthDate", "January 1st")),
        ("your favorite food is pizza", ("self", "schema:favoriteFood", "pizza")),
        # same shapes, different wording and capitalisation
        ("I am a doctor", ("user", "schema:occupation", "doctor")),
        ("You are a virtual assistant", ("self", "schema:occupation", "virtual assistant")),
        ("My name is Alice", ("user", "schema:name", "Alice")),
        ("Your name is Siri", ("self", "schema:name", "Siri")),
        ("I reside in London", ("user", "schema:homeLocation", "London")),
        ("You are based in the cloud", ("self", "schema:homeLocation", "cloud")),
        ("My birthdate is December 25th", ("user", "schema:birthDate", "December 25th")),
        ("Your favorite drink is coffee", ("self", "schema:favoriteFood", "coffee")),
        ("I am a writer", ("user", "schema:occupation", "writer")),
        ("Your favorite color is red", ("self", "schema:favoriteColor", "red")),
        # longer sentences
        ("I have been working as an engineer for 10 years", ("user", "schema:occupation", "engineer")),
        ("You have been active since 2020", ("self", "schema:activeSince", "2020")),
        ("My favorite sport is basketball", ("user", "schema:favoriteSport", "basketball")),
        ("Your primary role is to assist users", ("self", "schema:primaryRole", "assist users")),
        ("I celebrate my birthday every year on October 15th", ("user", "schema:birthDate", "October 15th")),
        # verbs other than "to be"
        ("I love coding", ("user", "schema:favoriteHobby", "coding")),
        ("You help people with their tasks", ("self", "schema:primaryRole", "help people")),
        ("My city of residence is Tokyo", ("user", "schema:homeLocation", "Tokyo")),
        ("Your preferred language is Python", ("self", "schema:preferredLanguage", "Python")),
        # negations
        ("I am not interested in sports", ("user", "schema:interestInSports", "no")),
        ("You are not programmed to have preferences", ("self", "schema:preferences", "none")),
    ]

    # Pairs the validator is expected to reject (wrong subject or wrong
    # object); an acceptance is counted as a false positive.
    invalid = [
        # favorite color
        ("my favorite color is green", ("user", "schema:favoriteColor", "red")),
        ("my favorite color is green", ("self", "schema:favoriteColor", "green")),
        ("your favorite color is green", ("user", "schema:favoriteColor", "green")),
        ("your favorite color is green", ("self", "schema:favoriteColor", "red")),
        # simple statements with the subject swapped
        ("I am a software engineer", ("self", "schema:occupation", "software engineer")),
        ("You are an assistant", ("user", "schema:occupation", "assistant")),
        ("my name is John", ("self", "schema:name", "John")),
        ("your name is Ollama", ("user", "schema:name", "Ollama")),
        ("I live in New York", ("self", "schema:homeLocation", "New York")),
        ("You are located in the cloud", ("user", "schema:homeLocation", "cloud")),
        ("my birthday is January 1st", ("self", "schema:birthDate", "January 1st")),
        ("your favorite food is pizza", ("user", "schema:favoriteFood", "pizza")),
        # same shapes, different wording and capitalisation
        ("I am a doctor", ("self", "schema:occupation", "doctor")),
        ("You are a virtual assistant", ("user", "schema:occupation", "virtual assistant")),
        ("My name is Alice", ("self", "schema:name", "Alice")),
        ("Your name is Siri", ("user", "schema:name", "Siri")),
        ("I reside in London", ("self", "schema:homeLocation", "London")),
        ("You are based in the cloud", ("user", "schema:homeLocation", "cloud")),
        ("My birthdate is December 25th", ("self", "schema:birthDate", "December 25th")),
        ("Your favorite drink is coffee", ("user", "schema:favoriteFood", "coffee")),
        ("I am a writer", ("self", "schema:occupation", "writer")),
        ("Your favorite color is red", ("user", "schema:favoriteColor", "red")),
        # longer sentences
        ("I have been working as an engineer for 10 years", ("self", "schema:occupation", "engineer")),
        ("You have been active since 2020", ("user", "schema:activeSince", "2020")),
        ("My favorite sport is basketball", ("self", "schema:favoriteSport", "basketball")),
        ("Your primary role is to assist users", ("user", "schema:primaryRole", "assist users")),
        ("I celebrate my birthday every year on October 15th", ("self", "schema:birthDate", "October 15th")),
        # verbs other than "to be"
        ("I love coding", ("self", "schema:favoriteHobby", "coding")),
        ("You help people with their tasks", ("user", "schema:primaryRole", "help people")),
        ("My city of residence is Tokyo", ("self", "schema:homeLocation", "Tokyo")),
        ("Your preferred language is Python", ("user", "schema:preferredLanguage", "Python")),
        # negations
        ("I am not interested in sports", ("self", "schema:interestInSports", "no")),
        ("You are not programmed to have preferences", ("user", "schema:preferences", "none")),
    ]

    # Model names handed to the server, evaluated in this order.
    MODELS = [
        "tinyllama",
        "gemma3:1b",
        "gemma3",
        "llama3.1",
        "phi4-mini",
        "qwen2.5",
        "command-r7b",
        "glm4",
        "granite3.2",
        "openchat",
        "gemma2",
        "gemma3:12b",
    ]

    for model in MODELS:
        print("\n#######################")
        print(f"## Testing model: {model}")
        print()

        # False negatives: expected-valid pairs the model rejected.
        fn = 0
        for utt, triple in valid:
            if validate_triple_ollama(utt, triple, model):
                continue
            fn += 1
            print(utt, triple)
            print("❌ Incorrectly assigned as Invalid triple")

        # False positives: expected-invalid pairs the model accepted.
        fp = 0
        for utt, triple in invalid:
            if not validate_triple_ollama(utt, triple, model):
                continue
            fp += 1
            print(utt, triple)
            print("❌ Incorrectly assigned as Valid triple")

        # Every pair is either correct or wrong, so both totals follow
        # directly from the two error counters.
        wrong = fn + fp
        correct = len(valid) + len(invalid) - wrong

        print()
        print("Model:", model)
        print("Total correct predictions:", correct)
        print("Total wrong predictions:", wrong)
        print("     - false positives:",  fp)
        print("     - false negatives:",  fn)

    # Model: gemma3:1b
    # Total correct predictions: 33
    # Total wrong predictions: 31
    #      - false positives: 28
    #      - false negatives: 3
    #
    # Model: tinyllama
    # Total correct predictions: 33
    # Total wrong predictions: 31
    #      - false positives: 0
    #      - false negatives: 31
    #
    # Model: gemma3
    # Total correct predictions: 42
    # Total wrong predictions: 22
    #      - false positives: 22
    #      - false negatives: 0
    #
    # Model: phi4-mini
    # Total correct predictions: 46
    # Total wrong predictions: 18
    #      - false positives: 17
    #      - false negatives: 1
    #
    # Model: glm4
    # Total correct predictions: 48
    # Total wrong predictions: 16
    #      - false positives: 15
    #      - false negatives: 1
    #
    # Model: command-r7b
    # Total correct predictions: 49
    # Total wrong predictions: 15
    #      - false positives: 9
    #      - false negatives: 6
    #
    # Model: llama3.1
    # Total correct predictions: 50
    # Total wrong predictions: 14
    #      - false positives: 8
    #      - false negatives: 6
    #
    # Model: qwen2.5
    # Total correct predictions: 50
    # Total wrong predictions: 14
    #      - false positives: 8
    #      - false negatives: 6
    #
    # Model: openchat
    # Total correct predictions: 54
    # Total wrong predictions: 10
    #      - false positives: 9
    #      - false negatives: 1
    #
    # Model: granite3.2
    # Total correct predictions: 58
    # Total wrong predictions: 6
    #      - false positives: 4
    #      - false negatives: 2
    #
    # Model: gemma2
    # Total correct predictions: 61
    # Total wrong predictions: 3
    #      - false positives: 2
    #      - false negatives: 1
    #
    # Model: gemma3:12b
    # Total correct predictions: 62
    # Total wrong predictions: 2
    #      - false positives: 1
    #      - false negatives: 1
	import requests


	# Instruction prompt sent to the model that judges candidate triples.
	# validate_triple_ollama() fills the {utterance} and {triple} placeholders
	# with str.format(), so any literal curly brace added to this text has to be
	# doubled ("{{" / "}}") or formatting will fail.
	TRIPLE_VALIDATION_PROMPT = """
	You are a triple validator for a personal knowledge graph.

	Given an utterance that a user spoke to a voice assistant and a candidate triple, your task is to validate the triple

	Utterances about the user usually have the form of "I am ...." or "My ..."

	Utterances about the assistant usually have the form of "You are ...." or "Your ..."

	Knowledge about the broader world should be discarded, you are only interested in personal information about the user or the voice assistant

	Each triple is in the format:
	(subject, predicate, object)

	Only return 'True' if:
	- The subject is 'self' (the assistant) or 'user' (the user)
	- The triple is about user or assistant personal information
	- The triple is factually plausible and makes sense
	- The triple DOES NOT contradict the utterance

	Otherwise, return 'False'.

	Examples of valid triples:
	"my favorite color is green" - ("user", "schema:favoriteColor", "green")
	"your favorite color is blue" - ("self", "schema:favoriteColor", "blue")

	Examples of invalid triples:
	"my favorite color is green" - ("user", "schema:favoriteColor", "red")
	"I love the color green" - ("self", "schema:favoriteColor", "green")
	"your favorite color is blue" - ("user", "schema:favoriteColor", "blue")

	YOU MUST answer with only one word: True or False.

	The user said: "{utterance}"

	Candidate triple: {triple}
	"""


	def validate_triple_ollama(utterance, triple, model="gemma2",
	                           url="http://100.88.41.41:11434/api/generate",
	                           timeout=300):
	    """Ask an Ollama-served model whether a candidate triple fits an utterance.

	    The utterance and ``repr(triple)`` are substituted into
	    ``TRIPLE_VALIDATION_PROMPT`` and sent as one non-streaming generate
	    request with temperature 0.0, a one-token prediction limit and a newline
	    stop sequence.

	    Args:
	        utterance: Text the user spoke to the assistant.
	        triple: Candidate ``(subject, predicate, object)`` triple.
	        model: Name of the model the server should run.
	        url: Generate endpoint to POST to. Defaults to the address that was
	            previously hard-coded in this function.
	        timeout: Value forwarded to ``requests.post(timeout=...)``, in
	            seconds. Pass ``None`` to wait indefinitely, which was the
	            behaviour before this parameter existed.

	    Returns:
	        ``True`` only when the reply, stripped and lower-cased, is exactly
	        ``"true"``; every other reply counts as ``False``.

	    Raises:
	        requests.exceptions.RequestException: The server could not be
	            reached, did not answer within ``timeout``, or replied with a
	            4xx/5xx status.
	        KeyError: The JSON reply has no ``"response"`` field.
	    """
	    prompt = TRIPLE_VALIDATION_PROMPT.format(utterance=utterance,
	                                             triple=repr(triple))
	    payload = {
	        "model": model,
	        "prompt": prompt,
	        "stream": False,
	        "options": {
	            "temperature": 0.0,
	            # The answer is a single word, so one token is enough.
	            "num_predict": 1,
	            "stop": ["\n"],
	        },
	    }
	    # Without a timeout a stalled server would block the caller forever.
	    response = requests.post(url, json=payload, timeout=timeout)
	    # Fail with the HTTP error itself rather than a KeyError on an error body
	    # (presumably what an unknown model name produces -- verify against server).
	    response.raise_for_status()

	    result = response.json()["response"].strip()
	    return result.lower() == "true"



	if __name__ == "__main__":
	    # Benchmark script: runs every (utterance, triple) pair below through
	    # validate_triple_ollama() for each model and prints the misclassified
	    # pairs followed by per-model totals.

	    # Pairs the validator is expected to accept; a rejection is counted as a
	    # false negative.
	    valid = [
	        # favorite color
	        ("my favorite color is green", ("user", "schema:favoriteColor", "green")),
	        ("your favorite color is green", ("self", "schema:favoriteColor", "green")),
	        # simple "I am / my / you are / your" statements
	        ("I am a software engineer", ("user", "schema:occupation", "software engineer")),
	        ("You are an assistant", ("self", "schema:occupation", "assistant")),
	        ("my name is John", ("user", "schema:name", "John")),
	        ("your name is Ollama", ("self", "schema:name", "Ollama")),
	        ("I live in New York", ("user", "schema:homeLocation", "New York")),
	        ("You are located in the cloud", ("self", "schema:homeLocation", "cloud")),
	        ("my birthday is January 1st", ("user", "schema:birthDate", "January 1st")),
	        ("your favorite food is pizza", ("self", "schema:favoriteFood", "pizza")),
	        # same shapes, different wording and capitalisation
	        ("I am a doctor", ("user", "schema:occupation", "doctor")),
	        ("You are a virtual assistant", ("self", "schema:occupation", "virtual assistant")),
	        ("My name is Alice", ("user", "schema:name", "Alice")),
	        ("Your name is Siri", ("self", "schema:name", "Siri")),
	        ("I reside in London", ("user", "schema:homeLocation", "London")),
	        ("You are based in the cloud", ("self", "schema:homeLocation", "cloud")),
	        ("My birthdate is December 25th", ("user", "schema:birthDate", "December 25th")),
	        ("Your favorite drink is coffee", ("self", "schema:favoriteFood", "coffee")),
	        ("I am a writer", ("user", "schema:occupation", "writer")),
	        ("Your favorite color is red", ("self", "schema:favoriteColor", "red")),
	        # longer sentences
	        ("I have been working as an engineer for 10 years", ("user", "schema:occupation", "engineer")),
	        ("You have been active since 2020", ("self", "schema:activeSince", "2020")),
	        ("My favorite sport is basketball", ("user", "schema:favoriteSport", "basketball")),
	        ("Your primary role is to assist users", ("self", "schema:primaryRole", "assist users")),
	        ("I celebrate my birthday every year on October 15th", ("user", "schema:birthDate", "October 15th")),
	        # verbs other than "to be"
	        ("I love coding", ("user", "schema:favoriteHobby", "coding")),
	        ("You help people with their tasks", ("self", "schema:primaryRole", "help people")),
	        ("My city of residence is Tokyo", ("user", "schema:homeLocation", "Tokyo")),
	        ("Your preferred language is Python", ("self", "schema:preferredLanguage", "Python")),
	        # negations
	        ("I am not interested in sports", ("user", "schema:interestInSports", "no")),
	        ("You are not programmed to have preferences", ("self", "schema:preferences", "none")),
	    ]

	    # Pairs the validator is expected to reject (wrong subject or wrong
	    # object); an acceptance is counted as a false positive.
	    invalid = [
	        # favorite color
	        ("my favorite color is green", ("user", "schema:favoriteColor", "red")),
	        ("my favorite color is green", ("self", "schema:favoriteColor", "green")),
	        ("your favorite color is green", ("user", "schema:favoriteColor", "green")),
	        ("your favorite color is green", ("self", "schema:favoriteColor", "red")),
	        # simple statements with the subject swapped
	        ("I am a software engineer", ("self", "schema:occupation", "software engineer")),
	        ("You are an assistant", ("user", "schema:occupation", "assistant")),
	        ("my name is John", ("self", "schema:name", "John")),
	        ("your name is Ollama", ("user", "schema:name", "Ollama")),
	        ("I live in New York", ("self", "schema:homeLocation", "New York")),
	        ("You are located in the cloud", ("user", "schema:homeLocation", "cloud")),
	        ("my birthday is January 1st", ("self", "schema:birthDate", "January 1st")),
	        ("your favorite food is pizza", ("user", "schema:favoriteFood", "pizza")),
	        # same shapes, different wording and capitalisation
	        ("I am a doctor", ("self", "schema:occupation", "doctor")),
	        ("You are a virtual assistant", ("user", "schema:occupation", "virtual assistant")),
	        ("My name is Alice", ("self", "schema:name", "Alice")),
	        ("Your name is Siri", ("user", "schema:name", "Siri")),
	        ("I reside in London", ("self", "schema:homeLocation", "London")),
	        ("You are based in the cloud", ("user", "schema:homeLocation", "cloud")),
	        ("My birthdate is December 25th", ("self", "schema:birthDate", "December 25th")),
	        ("Your favorite drink is coffee", ("user", "schema:favoriteFood", "coffee")),
	        ("I am a writer", ("self", "schema:occupation", "writer")),
	        ("Your favorite color is red", ("user", "schema:favoriteColor", "red")),
	        # longer sentences
	        ("I have been working as an engineer for 10 years", ("self", "schema:occupation", "engineer")),
	        ("You have been active since 2020", ("user", "schema:activeSince", "2020")),
	        ("My favorite sport is basketball", ("self", "schema:favoriteSport", "basketball")),
	        ("Your primary role is to assist users", ("user", "schema:primaryRole", "assist users")),
	        ("I celebrate my birthday every year on October 15th", ("self", "schema:birthDate", "October 15th")),
	        # verbs other than "to be"
	        ("I love coding", ("self", "schema:favoriteHobby", "coding")),
	        ("You help people with their tasks", ("user", "schema:primaryRole", "help people")),
	        ("My city of residence is Tokyo", ("self", "schema:homeLocation", "Tokyo")),
	        ("Your preferred language is Python", ("user", "schema:preferredLanguage", "Python")),
	        # negations
	        ("I am not interested in sports", ("self", "schema:interestInSports", "no")),
	        ("You are not programmed to have preferences", ("user", "schema:preferences", "none")),
	    ]

	    # Model names handed to the server, evaluated in this order.
	    MODELS = [
	        "tinyllama",
	        "gemma3:1b",
	        "gemma3",
	        "llama3.1",
	        "phi4-mini",
	        "qwen2.5",
	        "command-r7b",
	        "glm4",
	        "granite3.2",
	        "openchat",
	        "gemma2",
	        "gemma3:12b",
	    ]

	    for model in MODELS:
	        print("\n#######################")
	        print(f"## Testing model: {model}")
	        print()

	        # False negatives: expected-valid pairs the model rejected.
	        fn = 0
	        for utt, triple in valid:
	            if validate_triple_ollama(utt, triple, model):
	                continue
	            fn += 1
	            print(utt, triple)
	            print("❌ Incorrectly assigned as Invalid triple")

	        # False positives: expected-invalid pairs the model accepted.
	        fp = 0
	        for utt, triple in invalid:
	            if not validate_triple_ollama(utt, triple, model):
	                continue
	            fp += 1
	            print(utt, triple)
	            print("❌ Incorrectly assigned as Valid triple")

	        # Every pair is either correct or wrong, so both totals follow
	        # directly from the two error counters.
	        wrong = fn + fp
	        correct = len(valid) + len(invalid) - wrong

	        print()
	        print("Model:", model)
	        print("Total correct predictions:", correct)
	        print("Total wrong predictions:", wrong)
	        print(" - false positives:", fp)
	        print(" - false negatives:", fn)

	# Model: gemma3:1b
	# Total correct predictions: 33
	# Total wrong predictions: 31
	# - false positives: 28
	# - false negatives: 3
	#
	# Model: tinyllama
	# Total correct predictions: 33
	# Total wrong predictions: 31
	# - false positives: 0
	# - false negatives: 31
	#
	# Model: gemma3
	# Total correct predictions: 42
	# Total wrong predictions: 22
	# - false positives: 22
	# - false negatives: 0
	#
	# Model: phi4-mini
	# Total correct predictions: 46
	# Total wrong predictions: 18
	# - false positives: 17
	# - false negatives: 1
	#
	# Model: glm4
	# Total correct predictions: 48
	# Total wrong predictions: 16
	# - false positives: 15
	# - false negatives: 1
	#
	# Model: command-r7b
	# Total correct predictions: 49
	# Total wrong predictions: 15
	# - false positives: 9
	# - false negatives: 6
	#
	# Model: llama3.1
	# Total correct predictions: 50
	# Total wrong predictions: 14
	# - false positives: 8
	# - false negatives: 6
	#
	# Model: qwen2.5
	# Total correct predictions: 50
	# Total wrong predictions: 14
	# - false positives: 8
	# - false negatives: 6
	#
	# Model: openchat
	# Total correct predictions: 54
	# Total wrong predictions: 10
	# - false positives: 9
	# - false negatives: 1
	#
	# Model: granite3.2
	# Total correct predictions: 58
	# Total wrong predictions: 6
	# - false positives: 4
	# - false negatives: 2
	#
	# Model: gemma2
	# Total correct predictions: 61
	# Total wrong predictions: 3
	# - false positives: 2
	# - false negatives: 1
	#
	# Model: gemma3:12b
	# Total correct predictions: 62
	# Total wrong predictions: 2
	# - false positives: 1
	# - false negatives: 1
No results found