Skip to content

Instantly share code, notes, and snippets.

@mds2
Created August 31, 2024 20:55
Show Gist options
  • Select an option

  • Save mds2/34bf6d3a1a37c5f3f18d8492b9e31533 to your computer and use it in GitHub Desktop.

Select an option

Save mds2/34bf6d3a1a37c5f3f18d8492b9e31533 to your computer and use it in GitHub Desktop.
#!/usr/bin/python3
from sys import stdin
import argparse
"""Utility to convert the standard columnar output of
many UNIX commands (ls, ps, who) into a csv format that other
programs can parse.
Reads input on stdin, produces output on stdout.
Can use delimiters other than commas by specifying delimiters via
command-line flags.
Intended usage is something like the following
➜ py-play ls -alh *.py | head -n 5 | unix_to_csv.py
-rw-rw-r--,1,mschuresko,mschuresko,989,Jul,27,2023,bang_bang_rocket_sled.py
-rw-rw-r--,1,mschuresko,mschuresko,1.2K,Jul,27,2023,bang_bang_thermostat.py
-rw-rw-r--,1,mschuresko,mschuresko,1.2K,Jul,27,2023,bang_bang_treadmill.py
-rw-rw-r--,1,mschuresko,mschuresko,3.6K,Aug,24,2021,crypt_util.py
-rw-rw-r--,1,mschuresko,mschuresko,1.4K,Aug,30,2022,cycle_counter.py
"""
parser = argparse.ArgumentParser(
description="Turns UNIX command output into csv/tsv/etc")
parser.add_argument('-d', '--delimiter',
help="Delimiter for output",
default=',')
args = parser.parse_args()
# First we read stdin. Yes, we read all of it at once.
# Yes, this will be slow for large inputs, and blocking for
# streaming data, such as, for instance, the output of `tail -f`
lines = stdin.readlines()
lmax = max([len(l) for l in lines])
# We find where column boundaries are by looking for character columns
# that are spaces in every row of input (which is why we need to read
# all of input first)
# We do this by creating a set of all character columns in the input,
# and removing each column that has a non-space character in at least
# one row
spaces = set([i for i in range(lmax)])
for l in lines:
for i,c in enumerate(l):
if c != ' ' and i in spaces:
spaces.remove(i)
# Now we sort the spaces we found
# We're looking for gaps between spaces, so we add artificial
# leading and trailing spaces at -1 and lmax
spaces = [-1] + sorted(list(spaces)) + [lmax]
# Any gap between columns with spaces are data columns.
# Find those.
column_ranges = []
for i in range(len(spaces) - 1):
if spaces[i+1] - spaces[i] > 1:
column_ranges.append((spaces[i] + 1, spaces[i+1]))
# Then go through each line, capture the text in that line
# corresponding to each column of column_ranges, strip the text
# in each column, and collect it as a list of strings.
# Collect the results of each line in a list (of lists of strings)
by_cols = []
for l in lines:
curr_cols = []
for c_start, c_end in column_ranges:
curr_cols.append(l[:c_end][c_start:].strip())
by_cols.append(curr_cols)
# Now that each line is separated into columns,
# we can print them out separated by the delimiter
for row in by_cols:
print(args.delimiter.join(row))
@irisdyoung
Copy link

Hey, I think this does what you want with SQL querying :). I'm not proud of it but it works on simple ls -l output.

diff --git a/unix_to_csv.py b/unix_to_csv.py
index 5069dce..d371090 100644
--- a/unix_to_csv.py
+++ b/unix_to_csv.py
@@ -30,6 +30,12 @@ parser.add_argument('-d', '--delimiter',
                     help="Delimiter for output",
                     default=',')
 
+parser.add_argument('-q', '--query',
+                    help="Query on the resulting table in SQL syntax, \
+                    assuming columns are labeled col0, col1, etc. \
+                    and the table is named output",
+                    default=None)
+
 args = parser.parse_args()
 
 
@@ -75,6 +81,44 @@ for l in lines:
         curr_cols.append(l[:c_end][c_start:].strip())
     by_cols.append(curr_cols)
 
+# If user requests a SQL-format query to be executed on the result,
+# prepare and query the necessary table
+if args.query is not None:
+    ncols = max([len(row) for row in by_cols])
+    if ncols > 0:
+        import sqlite3
+        conn = sqlite3.connect('tmp.db')
+        cursor = conn.cursor()
+
+        # Use a column named 'rowid' for the primary key consisting
+        # of row number, and label all the rest generically
+        header_decl = ','.join(['rowid int'] + [f'col{n} text' for n in range(ncols)])
+        cursor.execute(f'CREATE TABLE output ( {header_decl} );')
+
+        # Prepare each item in each row as a string in quotes,
+        # except the row id, which we will also need to prepend
+        formatted_by_cols = [[rowid]+row for rowid, row in enumerate(by_cols)]
+        placeholder_str = ','.join(['?']*(ncols+1))
+        cursor.executemany(f'INSERT INTO output VALUES ({placeholder_str});', formatted_by_cols)
+
+        # Attempt to execute the user's requested query
+        try:
+            cursor.execute(args.query)
+            results = cursor.fetchall()
+        except Exception as e:
+            print('Could not query results as requested.')
+            print('Requested query: ', args.query)
+            print('Error message: ', e)
+            print('Table query was attempted on:')
+            cursor.execute('SELECT * from output;')
+            print(cursor.fetchall())
+            exit(1)
+        conn.commit()
+        conn.close()
+
+        # Restore results to expected format for script
+        by_cols = [[item for item in row if type(item) is str] for row in results]
+
 # Now that each line is separated into columns,
 # we can print them out separated by the delimiter
 for row in by_cols:

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment