Skip to content

Instantly share code, notes, and snippets.

@zhaoyanpeng
Created April 11, 2024 12:08
Show Gist options
  • Select an option

  • Save zhaoyanpeng/3ff976f6215b55cb4d6deb4725e9e218 to your computer and use it in GitHub Desktop.

Select an option

Save zhaoyanpeng/3ff976f6215b55cb4d6deb4725e9e218 to your computer and use it in GitHub Desktop.
# Base directory that holds the datasets (placeholder path; adjust per machine).
droot = "/home/xxxx/data"
# Directory containing every corpus file referenced below.
kroot = droot + "/kelm-corpus/updated-2021"

# Bare file names, all resolved relative to `kroot`.
kname = "kelm_generated_corpus.jsonl"
ename = "entities.jsonl"
tekgen_train = "quadruples-train.tsv"
tekgen_test = "quadruples-test.tsv"
tekgen_val = "quadruples-validation.tsv"

# Full paths: `kroot` joined with each file name by a single "/".
tekgen_train_file = "/".join((kroot, tekgen_train))
tekgen_test_file = "/".join((kroot, tekgen_test))
tekgen_val_file = "/".join((kroot, tekgen_val))
kfile = "/".join((kroot, kname))
efile = "/".join((kroot, ename))
def serialize_triples(jsons, ofile=None):
    """Serialize each row's triples into a bracketed string and optionally save it.

    For every row of `jsons`, element 0 of the first triple becomes the root and
    every 3- or 4-element entry of the row's ``"triples"`` becomes one child:

    * 3 elements ``(_, r, o)``    -> ``{ r, o }``; runs of two or more colons
      in ``r`` are collapsed to a single ``:``.
    * 4 elements ``(_, o, r, v)`` -> ``{ r::v, o }``; runs of three or more
      colons in the joined ``r::v`` are collapsed to ``::``.

    Entries of any other length are ignored. The row is rendered as
    ``{ root { child child ... } }``. Curly braces occurring inside the data
    are replaced by parentheses because braces are reserved as the structural
    delimiters of the output.

    Rows whose triples cannot be processed (empty, not indexable, or holding
    non-string elements) are skipped without writing anything.

    Args:
        jsons: Table of rows exposing ``columns``, ``iterrows()`` and ``len()``
            (presumably a pandas DataFrame -- confirm against the caller). It
            must have a ``"triples"`` column and at least three columns; the
            third column is written out as the sentence.
        ofile: Path of the JSON-lines file to write. When ``None`` nothing is
            written and the rows are only iterated.

    Returns:
        None. When `ofile` is given, one ``{"bracketed": ..., "sentence": ...}``
        record is written per successfully serialized row.
    """
    brace_table = str.maketrans("{}", "()")
    # Compiled once here instead of on every triple inside the loop.
    two_or_more_colons = re.compile("[:]{2,}")
    three_or_more_colons = re.compile("[:]{3,}")

    def replace_special(s):
        # "{" and "}" are reserved for the bracketed output format.
        return s.translate(brace_table)

    def serialize_tri(d):
        r, o = (replace_special(x) for x in d[1:])
        r = two_or_more_colons.sub(":", r)  # ":" appears 1 time at most in a row
        return f"{{ {r}, {o} }}"

    def serialize_quad(d):
        o, r, v = (replace_special(x) for x in d[1:])
        # ":" appears 2 times at most in a row, i.e. only as the "::" separator form
        rv = three_or_more_colons.sub("::", f"{r}::{v}")
        return f"{{ {rv}, {o} }}"

    # Dispatch on entry length; lengths not listed here are ignored.
    serializers = {3: serialize_tri, 4: serialize_quad}

    writer = jsonlines.open(ofile, "w") if ofile is not None else None
    try:
        sentence_key = jsons.columns.values[2]
        for _, row in tqdm(jsons.iterrows(), total=len(jsons)):
            triples = row["triples"]
            try:
                root = replace_special(triples[0][0])
                children = [
                    serializers[len(tp)](tp)
                    for tp in triples
                    if len(tp) in serializers
                ]
            except (LookupError, TypeError, ValueError, AttributeError):
                # Best-effort: skip malformed rows, but let KeyboardInterrupt,
                # SystemExit and unrelated failures propagate.
                continue

            bracketed_children = "{ " + " ".join(children) + " }"
            serialized = "{ " + root + " " + bracketed_children + " }"
            if writer is not None:
                writer.write({
                    "bracketed": serialized,
                    "sentence": row[sentence_key],
                })

        if writer is not None:
            print(f"Saving to `{ofile}`")
    finally:
        # Always release the output file, even if a row raised unexpectedly.
        if writer is not None:
            writer.close()
# Select the input to convert. Alternatives for `data_file`:
#   tekgen_val_file, tekgen_test_file, tekgen_train_file
# with the matching table for `json_data`:
#   json_val, json_test, json_train
data_file = kfile
json_data = json_kelm

# Output path: the input path with its last extension replaced by
# ".bracketed.jsonl" (the suffix is simply appended if there is no ".").
ofile = "".join([data_file.rsplit(".", 1)[0], ".bracketed.jsonl"])
serialize_triples(json_data, ofile=ofile)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment