Last active
June 11, 2024 23:37
-
-
Save arrufat/23eb99a5635c2e1671e7fdb6f2e017ef to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Extract question-style Reddit posts from JSON Lines dumps.

Usage: pass one or more ``.jsonl`` paths on the command line. Every line of
each input file is parsed as one JSON document. Posts whose title ends with
"?" are reduced to a small record and written, one JSON object per line, to
``out.jsonl`` in the current working directory.
"""
from sys import argv, exit
import json

# Name of the file (relative to the working directory) that receives the output.
OUTPUT_PATH = "out.jsonl"


def _extract_question(root):
    """Return the condensed record for a question post, or None to skip it.

    Args:
        root: One decoded JSON document from the input.

    Returns:
        A dict with the keys ``title``, ``subreddit``, ``score``,
        ``num_comments``, ``created_utc``, ``text`` and ``media``, or ``None``
        when ``root`` is not an object, has no string title, or the title does
        not end with "?".
    """
    if not isinstance(root, dict):
        return None
    title = root.get("title")
    if not isinstance(title, str):
        return None
    # Titles such as "[deleted by user]" never end with "?", so this single
    # check also filters them out; no separate comparison is needed.
    if not title.endswith("?"):
        return None
    return {
        "title": title,
        # Missing fields fall back to neutral defaults instead of raising
        # KeyError, so one incomplete record cannot abort a whole run.
        "subreddit": root.get("subreddit", ""),
        "score": root.get("score", 0),
        "num_comments": root.get("num_comments", 0),
        "created_utc": root.get("created_utc", 0),
        "text": root.get("selftext", ""),
        "media": root.get("media") is not None,
    }


def main(paths):
    """Filter every file in ``paths`` and write the matches to ``out.jsonl``.

    Args:
        paths: Iterable of input file paths; each is printed before it is read.

    Raises:
        OSError: If an input file or the output file cannot be opened.
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
    """
    # Explicit UTF-8 keeps the behavior independent of the platform's locale.
    with open(OUTPUT_PATH, "w", encoding="utf-8") as out:
        for path in paths:
            print(path)
            with open(path, "r", encoding="utf-8") as jsonl:
                for line in jsonl:
                    # Blank lines hold no JSON document; skip them rather
                    # than letting json.loads fail on empty input.
                    if not line.strip():
                        continue
                    record = _extract_question(json.loads(line))
                    if record is None:
                        continue
                    out.write(json.dumps(record))
                    out.write("\n")


if __name__ == "__main__":
    # Without input files there is nothing to do; exit before touching the
    # output file so an existing out.jsonl is not truncated.
    if len(argv) < 2:
        exit()
    main(argv[1:])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| const std = @import("std"); | |
/// Filters Reddit JSON Lines dumps down to question posts.
///
/// Every command-line argument is treated as the path of a file containing one
/// JSON document per line. Posts whose title ends with "?" are condensed into
/// a `PostMetaInfo` and written, one JSON object per line, to `output.jsonl`
/// in the current working directory. Returns without creating the output file
/// when no input paths are given.
///
/// Errors: propagates file-system, allocation, and JSON parse errors.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const args = try std.process.argsAlloc(allocator);
    defer std.process.argsFree(allocator, args);
    if (args.len < 2) return;

    // Reused for every line so the buffer is allocated at most a few times.
    var line = std.ArrayList(u8).init(allocator);
    defer line.deinit();

    const out_file = try std.fs.cwd().createFile("output.jsonl", .{ .read = false });
    defer out_file.close();
    var buffered_writer = std.io.bufferedWriter(out_file.writer());
    const writer = buffered_writer.writer();

    for (args[1..]) |file_path| {
        std.log.debug("file_path: {s}\n", .{file_path});
        const file = try std.fs.cwd().openFile(file_path, .{ .mode = .read_only });
        defer file.close();
        var buffered_reader = std.io.bufferedReader(file.reader());
        const reader = buffered_reader.reader();

        var at_end = false;
        while (!at_end) {
            line.clearRetainingCapacity();
            reader.streamUntilDelimiter(line.writer(), '\n', null) catch |err| switch (err) {
                // The bytes read before the end of the stream are already in
                // `line`: a final line without a trailing newline must still
                // be processed, so only stop after handling it.
                error.EndOfStream => at_end = true,
                else => return err,
            };
            try writeQuestionPost(allocator, line.items, writer);
        }
    }
    try buffered_writer.flush();
}

/// Parses one input line and, if it is a question post, writes its condensed
/// JSON form followed by a newline to `writer`.
///
/// Lines that are blank, are not JSON objects, have no string `title`, or
/// whose title does not end with "?" produce no output.
fn writeQuestionPost(allocator: std.mem.Allocator, line: []const u8, writer: anytype) !void {
    // A blank line holds no JSON document; parsing it would fail the whole run.
    if (std.mem.trim(u8, line, " \t\r\n").len == 0) return;

    const root = try std.json.parseFromSlice(std.json.Value, allocator, line, .{});
    defer root.deinit();

    const object = switch (root.value) {
        .object => |obj| obj,
        else => return,
    };
    // Check the active tag instead of reading `.string` directly: a null or
    // numeric title would otherwise be an illegal union field access.
    const title = switch (object.get("title") orelse return) {
        .string => |s| s,
        else => return,
    };
    // Placeholder titles such as "[deleted by user]" do not end with "?", so
    // this single check filters them out as well.
    if (!std.mem.endsWith(u8, title, "?")) return;

    const pmi = PostMetaInfo{
        .title = title,
        .subreddit = stringOr(object.get("subreddit"), ""),
        .score = try integerOr(object.get("score"), 0),
        .num_comments = try integerOr(object.get("num_comments"), 0),
        .created_utc = try integerOr(object.get("created_utc"), 0),
        .text = stringOr(object.get("selftext"), ""),
        .media = if (object.get("media")) |value| value != .null else false,
    };
    try std.json.stringify(pmi, .{}, writer);
    try writer.print("\n", .{});
}

/// Returns the string payload of `value`, or `default` when the value is
/// absent or is not a JSON string.
fn stringOr(value: ?std.json.Value, default: []const u8) []const u8 {
    return switch (value orelse return default) {
        .string => |s| s,
        else => default,
    };
}

/// Returns `value` as an `i64`: integers as-is, floats truncated toward zero,
/// strings parsed as base-10. Absent values, other JSON types, and floats
/// outside the `i64` range yield `default`. A string that is not a valid
/// integer is reported as an error.
fn integerOr(value: ?std.json.Value, default: i64) !i64 {
    return switch (value orelse return default) {
        .integer => |n| n,
        .float => |f| floatToI64(f) orelse default,
        .string => |s| try std.fmt.parseInt(i64, s, 10),
        else => default,
    };
}

/// Converts `f` to `i64`, or returns null when it does not fit.
fn floatToI64(f: f64) ?i64 {
    // 2^63 is exactly representable as f64; values at or beyond it (and NaN,
    // which fails both comparisons) cannot be converted safely.
    const limit: f64 = 9223372036854775808.0;
    if (!(f >= -limit and f < limit)) return null;
    const n: i64 = @intFromFloat(f);
    return n;
}
/// Condensed view of a single post; `main` serializes one of these per
/// output line with `std.json.stringify`.
const PostMetaInfo = struct {
    /// Value of the input object's "title" key.
    title: []const u8,
    /// Value of the input object's "subreddit" key.
    subreddit: []const u8,
    /// Value of the input object's "score" key.
    score: i64 = 0,
    /// Value of the input object's "num_comments" key.
    num_comments: i64 = 0,
    /// Value of the input object's "created_utc" key, as an integer.
    created_utc: i64 = 0,
    /// Value of the input object's "selftext" key.
    text: []const u8,
    /// True when the input object's "media" key is present and not null.
    media: bool,
};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment