mds2 · August 31, 2024 20:55 · irisdyoung · Dec 2, 2024
diff --git a/unix_to_csv.py b/unix_to_csv.py
 #!/usr/bin/python3

 from sys import stdin
 import argparse

 """Utility to convert the standard columnar output of
 many UNIX commands (ls, ps, who) into a csv format that other
 programs can parse.

 Reads input on stdin, produces output on stdout.
 The output delimiter defaults to a comma; pass -d/--delimiter to use a
 different one.

 Intended usage is something like the following

 ➜  py-play ls -alh *.py | head -n 5 | unix_to_csv.py
 -rw-rw-r--,1,mschuresko,mschuresko,989,Jul,27,2023,bang_bang_rocket_sled.py
 -rw-rw-r--,1,mschuresko,mschuresko,1.2K,Jul,27,2023,bang_bang_thermostat.py
 -rw-rw-r--,1,mschuresko,mschuresko,1.2K,Jul,27,2023,bang_bang_treadmill.py
 -rw-rw-r--,1,mschuresko,mschuresko,3.6K,Aug,24,2021,crypt_util.py
 -rw-rw-r--,1,mschuresko,mschuresko,1.4K,Aug,30,2022,cycle_counter.py

 """


 def find_column_ranges(lines):
     """Locate the data columns shared by every line.

     A character position is a separator when it holds a space (or lies
     past the end of the line) in every input line.  Each maximal run of
     non-separator positions is one data column.  Only the space character
     counts as blank; any other character, including a tab, is data.

     Args:
         lines: Input lines with their line terminators already removed.

     Returns:
         A list of (start, end) half-open index pairs, left to right.
         Empty when there are no lines or no line holds a non-space
         character.
     """
     # default=0 keeps empty input from raising ValueError.
     width = max((len(line) for line in lines), default=0)

     # Start by assuming every position is a separator, then rule out each
     # position where some line holds a non-space character.
     separators = set(range(width))
     for line in lines:
         separators.difference_update(
             idx for idx, char in enumerate(line) if char != ' ')

     # Sentinel separators at -1 and width let the first and last data
     # columns be found the same way as interior ones.
     edges = [-1] + sorted(separators) + [width]
     return [(left + 1, right)
             for left, right in zip(edges, edges[1:])
             if right - left > 1]


 def split_columns(lines, column_ranges):
     """Cut every line at the given column spans.

     Args:
         lines: Input lines with their line terminators already removed.
         column_ranges: (start, end) pairs as returned by
             find_column_ranges.

     Returns:
         One list of whitespace-stripped field strings per input line.  A
         line shorter than a span yields an empty string for that field.
     """
     return [[line[start:end].strip() for start, end in column_ranges]
             for line in lines]


 def convert_lines(raw_lines, delimiter=','):
     """Convert space-aligned columnar text into delimiter-joined rows.

     Args:
         raw_lines: Lines of text, with or without trailing newlines.
         delimiter: String placed between fields in each output row.

     Returns:
         A list with one output string (no trailing newline) per input
         line.  Fields are joined as-is; a field that itself contains the
         delimiter is not quoted or escaped.
     """
     # Drop line terminators first.  Otherwise the newline counts as a
     # non-space character, which can erase a genuine separator column on
     # short lines and adds an empty trailing field after trailing spaces.
     lines = [raw.rstrip('\r\n') for raw in raw_lines]
     column_ranges = find_column_ranges(lines)
     return [delimiter.join(fields)
             for fields in split_columns(lines, column_ranges)]


 def main():
     """Read columnar text on stdin and print delimited rows on stdout."""
     parser = argparse.ArgumentParser(
         description="Turns UNIX command output into csv/tsv/etc")
     parser.add_argument('-d', '--delimiter',
                         help="Delimiter for output",
                         default=',')
     args = parser.parse_args()

     # All of stdin is read before any output is produced, because the
     # column boundaries depend on every line.  This is slow for large
     # inputs and blocks on streaming data such as `tail -f`.
     for row in convert_lines(stdin.readlines(), args.delimiter):
         print(row)


 if __name__ == "__main__":
     main()
	#!/usr/bin/python3

	from sys import stdin
	import argparse

	"""Utility to convert the standard columnar output of
	many UNIX commands (ls, ps, who) into a csv format that other
	programs can parse.

	Reads input on stdin, produces output on stdout.
	The output delimiter defaults to a comma; pass -d/--delimiter to use a
	different one.

	Intended usage is something like the following

	➜ py-play ls -alh *.py \| head -n 5 \| unix_to_csv.py
	-rw-rw-r--,1,mschuresko,mschuresko,989,Jul,27,2023,bang_bang_rocket_sled.py
	-rw-rw-r--,1,mschuresko,mschuresko,1.2K,Jul,27,2023,bang_bang_thermostat.py
	-rw-rw-r--,1,mschuresko,mschuresko,1.2K,Jul,27,2023,bang_bang_treadmill.py
	-rw-rw-r--,1,mschuresko,mschuresko,3.6K,Aug,24,2021,crypt_util.py
	-rw-rw-r--,1,mschuresko,mschuresko,1.4K,Aug,30,2022,cycle_counter.py

	"""


	def find_column_ranges(lines):
	    """Locate the data columns shared by every line.

	    A character position is a separator when it holds a space (or lies
	    past the end of the line) in every input line.  Each maximal run of
	    non-separator positions is one data column.  Only the space character
	    counts as blank; any other character, including a tab, is data.

	    Args:
	        lines: Input lines with their line terminators already removed.

	    Returns:
	        A list of (start, end) half-open index pairs, left to right.
	        Empty when there are no lines or no line holds a non-space
	        character.
	    """
	    # default=0 keeps empty input from raising ValueError.
	    width = max((len(line) for line in lines), default=0)

	    # Start by assuming every position is a separator, then rule out each
	    # position where some line holds a non-space character.
	    separators = set(range(width))
	    for line in lines:
	        separators.difference_update(
	            idx for idx, char in enumerate(line) if char != ' ')

	    # Sentinel separators at -1 and width let the first and last data
	    # columns be found the same way as interior ones.
	    edges = [-1] + sorted(separators) + [width]
	    return [(left + 1, right)
	            for left, right in zip(edges, edges[1:])
	            if right - left > 1]


	def split_columns(lines, column_ranges):
	    """Cut every line at the given column spans.

	    Args:
	        lines: Input lines with their line terminators already removed.
	        column_ranges: (start, end) pairs as returned by
	            find_column_ranges.

	    Returns:
	        One list of whitespace-stripped field strings per input line.  A
	        line shorter than a span yields an empty string for that field.
	    """
	    return [[line[start:end].strip() for start, end in column_ranges]
	            for line in lines]


	def convert_lines(raw_lines, delimiter=','):
	    """Convert space-aligned columnar text into delimiter-joined rows.

	    Args:
	        raw_lines: Lines of text, with or without trailing newlines.
	        delimiter: String placed between fields in each output row.

	    Returns:
	        A list with one output string (no trailing newline) per input
	        line.  Fields are joined as-is; a field that itself contains the
	        delimiter is not quoted or escaped.
	    """
	    # Drop line terminators first.  Otherwise the newline counts as a
	    # non-space character, which can erase a genuine separator column on
	    # short lines and adds an empty trailing field after trailing spaces.
	    lines = [raw.rstrip('\r\n') for raw in raw_lines]
	    column_ranges = find_column_ranges(lines)
	    return [delimiter.join(fields)
	            for fields in split_columns(lines, column_ranges)]


	def main():
	    """Read columnar text on stdin and print delimited rows on stdout."""
	    parser = argparse.ArgumentParser(
	        description="Turns UNIX command output into csv/tsv/etc")
	    parser.add_argument('-d', '--delimiter',
	                        help="Delimiter for output",
	                        default=',')
	    args = parser.parse_args()

	    # All of stdin is read before any output is produced, because the
	    # column boundaries depend on every line.  This is slow for large
	    # inputs and blocks on streaming data such as `tail -f`.
	    for row in convert_lines(stdin.readlines(), args.delimiter):
	        print(row)


	if __name__ == "__main__":
	    main()
No results found