Skip to content

Instantly share code, notes, and snippets.

@arrufat
Last active June 11, 2024 23:37
Show Gist options
  • Select an option

  • Save arrufat/23eb99a5635c2e1671e7fdb6f2e017ef to your computer and use it in GitHub Desktop.

Select an option

Save arrufat/23eb99a5635c2e1671e7fdb6f2e017ef to your computer and use it in GitHub Desktop.
from sys import argv, exit
import json
# Filter Reddit submission dumps (JSON Lines) down to question-style posts.
#
# Every path given on the command line is read line by line; each line is
# expected to hold one JSON document. Posts whose title ends with "?" are
# reduced to a small record and appended to "out.jsonl" in the current
# directory, one JSON object per line. With no arguments the script exits
# without creating the output file.
if len(argv) < 2:
    exit()

# JSON text is UTF-8, so do not depend on the platform's locale encoding.
with open("out.jsonl", "w", encoding="utf-8") as out:
    for arg in argv[1:]:
        print(arg)
        with open(arg, "r", encoding="utf-8") as jsonl:
            for line in jsonl:
                # Tolerate blank lines (for example a trailing empty line)
                # instead of failing inside json.loads.
                if not line.strip():
                    continue
                root = json.loads(line)
                # Only JSON objects can carry the fields we need.
                if not isinstance(root, dict):
                    continue
                title = root.get("title")
                # A missing or non-string (e.g. null) title cannot be a question.
                if not isinstance(title, str):
                    continue
                if title == "[deleted by user]":
                    continue
                if not title.endswith("?"):
                    continue
                # Missing keys fall back to neutral defaults rather than
                # aborting the whole run with a KeyError.
                pmi = {
                    "title": title,
                    "subreddit": root.get("subreddit", ""),
                    "score": root.get("score", 0),
                    "num_comments": root.get("num_comments", 0),
                    "created_utc": root.get("created_utc", 0),
                    "text": root.get("selftext", ""),
                    "media": root.get("media") is not None,
                }
                out.write(json.dumps(pmi))
                out.write("\n")
const std = @import("std");
/// Filters Reddit submission dumps (JSON Lines) down to question-style posts.
///
/// Every command-line argument is treated as the path of an input file that
/// holds one JSON document per line. Posts whose title ends with "?" are
/// converted to a `PostMetaInfo` record and written, one JSON object per
/// line, to "output.jsonl" in the current working directory. With no
/// arguments the function returns without creating the output file.
///
/// Errors from opening/reading/writing files, from JSON parsing, and from
/// allocation are propagated to the caller.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const args = try std.process.argsAlloc(allocator);
    defer std.process.argsFree(allocator, args);
    if (args.len < 2) return;

    // One growable buffer is reused for every input line.
    var line = std.ArrayList(u8).init(allocator);
    defer line.deinit();

    const out_file = try std.fs.cwd().createFile("output.jsonl", .{ .read = false });
    defer out_file.close();
    var buffered_writer = std.io.bufferedWriter(out_file.writer());
    const writer = buffered_writer.writer();

    for (args[1..]) |file_path| {
        std.log.debug("file_path: {s}\n", .{file_path});
        const file = try std.fs.cwd().openFile(file_path, .{ .mode = .read_only });
        defer file.close();
        var buffered_reader = std.io.bufferedReader(file.reader());
        const reader = buffered_reader.reader();

        var reached_eof = false;
        while (!reached_eof) {
            defer line.clearRetainingCapacity();
            reader.streamUntilDelimiter(line.writer(), '\n', null) catch |err| switch (err) {
                // Bytes read before the end of the stream are already in
                // `line`; remember to stop, but still process them so a last
                // line without a trailing newline is not lost.
                error.EndOfStream => reached_eof = true,
                else => return err,
            };
            // Skip blank lines (and strip a trailing '\r') instead of
            // failing in the JSON parser.
            const json_line = std.mem.trim(u8, line.items, " \t\r");
            if (json_line.len == 0) continue;
            try writeQuestionPost(allocator, json_line, writer);
        }
    }
    try buffered_writer.flush();
}

/// Parses one JSON document and, when it is an object describing a question
/// post (string title ending with "?" that is not "[deleted by user]"),
/// writes its `PostMetaInfo` as a single JSON line to `writer`.
/// Documents that are not objects, or that have no string title, are ignored.
fn writeQuestionPost(allocator: std.mem.Allocator, json_line: []const u8, writer: anytype) !void {
    const root = try std.json.parseFromSlice(std.json.Value, allocator, json_line, .{});
    defer root.deinit();

    const object = switch (root.value) {
        .object => |object| object,
        else => return,
    };
    // Check the tag before reading the payload: a null or numeric title
    // must be skipped, not read as a string.
    const title = switch (object.get("title") orelse return) {
        .string => |string| string,
        else => return,
    };
    if (std.mem.eql(u8, title, "[deleted by user]")) return;
    if (!std.mem.endsWith(u8, title, "?")) return;

    const pmi = PostMetaInfo{
        .title = title,
        .subreddit = stringField(object, "subreddit"),
        .score = try integerField(object, "score"),
        .num_comments = try integerField(object, "num_comments"),
        .created_utc = try integerField(object, "created_utc"),
        .text = stringField(object, "selftext"),
        .media = if (object.get("media")) |value| value != .null else false,
    };
    try std.json.stringify(pmi, .{}, writer);
    try writer.print("\n", .{});
}

/// Returns the string stored under `key`, or "" when the key is missing or
/// holds a non-string value.
fn stringField(object: std.json.ObjectMap, key: []const u8) []const u8 {
    const value = object.get(key) orelse return "";
    return switch (value) {
        .string => |string| string,
        else => "",
    };
}

/// Returns the value stored under `key` as an `i64`: integers as-is, floats
/// converted with `@intFromFloat`, strings parsed as base-10. Missing keys
/// and any other value type yield 0. A string that is not a valid integer
/// propagates the `std.fmt.parseInt` error.
fn integerField(object: std.json.ObjectMap, key: []const u8) !i64 {
    const value = object.get(key) orelse return 0;
    return switch (value) {
        .string => |string| try std.fmt.parseInt(i64, string, 10),
        .integer => |integer| integer,
        .float => |float| @as(i64, @intFromFloat(float)),
        else => 0,
    };
}
/// Reduced record for one post, built in `main` from a parsed JSON object
/// and serialized with `std.json.stringify`.
const PostMetaInfo = struct {
    /// Value of the "title" key.
    title: []const u8,
    /// Value of the "subreddit" key; `main` uses "" when the key is missing.
    subreddit: []const u8,
    /// Value of the "score" key; 0 when the key is missing.
    score: i64 = 0,
    /// Value of the "num_comments" key; 0 when the key is missing.
    num_comments: i64 = 0,
    /// Value of the "created_utc" key converted to an integer; 0 when the
    /// key is missing. Presumably a Unix timestamp — confirm against the data.
    created_utc: i64 = 0,
    /// Value of the "selftext" key; `main` uses "" when the key is missing.
    text: []const u8,
    /// True when the "media" key is present and its value is not JSON null.
    media: bool,
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment