bittlingmayer · April 23, 2020 14:18
diff --git a/README.md b/README.md
diff --git a/split.sh b/split.sh
 # Split the file named by $1 into a random held-out test set and a training set.
 #
 # Arguments: $1 - path of the line-oriented file to split
 # Outputs:   ${1}.test  - test_size lines picked at random from $1
 #            ${1}.train - every line of $1 that is not identical to a test line
 #            a progress message and the `wc -l` counts of both files on stdout
 # Returns:   early, writing nothing, when $1 has min_len lines or fewer;
 #            1 when $1 cannot be read or the shuffle step fails
 # Note:      relies on `return`, so it must run inside a function or be sourced.
 original_file=$1

 test_size=100
 min_len=1000

 # An empty path would leave `wc` waiting on stdin, so reject it up front.
 if [[ -z "${original_file}" || ! -r "${original_file}" ]]; then
   echo "Cannot read input file: '${original_file}'" >&2
   return 1
 fi

 # Feed wc through stdin so it prints only the count; awk drops any padding.
 line_count=$(wc -l < "${original_file}" | awk '{print $1;}')
 if [[ "${line_count}" -le "${min_len}" ]]; then
   echo "Not splitting for test.  Too few lines in ${original_file}"
   return
 fi

 shuffled_file=${original_file}.shuf
 test_file=${original_file}.test
 train_file=${original_file}.train

 echo "Shuffling ${original_file}"
 # on Mac there is no `shuf` by default, so using gshuf instead
 # or `sort` as a last resort.  (`sort -R` is not real shuffle, and is slow)
 # gshuf is GNU shuf under another name, so it takes no -R flag (that is sort's).
 if command -v shuf > /dev/null 2>&1; then
   shuf -- "${original_file}" > "${shuffled_file}"
 elif command -v gshuf > /dev/null 2>&1; then
   gshuf -- "${original_file}" > "${shuffled_file}"
 elif command -v sort > /dev/null 2>&1; then
   sort -R -- "${original_file}" > "${shuffled_file}"
 fi

 # The input has more than min_len lines, so an empty or missing shuffle output
 # means the shuffle failed (or no tool was found). Stop here rather than emit
 # an empty test set and a train set equal to the input.
 if [[ ! -s "${shuffled_file}" ]]; then
   echo "Shuffling ${original_file} failed" >&2
   rm -f -- "${shuffled_file}"
   return 1
 fi

 head -n "${test_size}" -- "${shuffled_file}" > "${test_file}"

 # This is the trick.
 # -F fixed strings, -x whole-line match, -f patterns from the test file, -v
 # invert: keep only lines that equal no test line. Duplicates of a test line
 # are dropped from train as well, so the two sets never share a line.
 grep -Fvxf "${test_file}" -- "${original_file}" > "${train_file}"

 wc -l -- "${test_file}" "${train_file}"

 rm -- "${shuffled_file}"
	# Split the file named by $1 into a random held-out test set and a training set.
	#
	# Arguments: $1 - path of the line-oriented file to split
	# Outputs:   ${1}.test  - test_size lines picked at random from $1
	#            ${1}.train - every line of $1 that is not identical to a test line
	#            a progress message and the `wc -l` counts of both files on stdout
	# Returns:   early, writing nothing, when $1 has min_len lines or fewer;
	#            1 when $1 cannot be read or the shuffle step fails
	# Note:      relies on `return`, so it must run inside a function or be sourced.
	original_file=$1

	test_size=100
	min_len=1000

	# An empty path would leave `wc` waiting on stdin, so reject it up front.
	if [[ -z "${original_file}" || ! -r "${original_file}" ]]; then
		echo "Cannot read input file: '${original_file}'" >&2
		return 1
	fi

	# A real pipe is required here: an escaped `\|` is handed to wc as a file name.
	# Feeding wc through stdin makes it print only the count; awk drops any padding.
	line_count=$(wc -l < "${original_file}" | awk '{print $1;}')
	if [[ "${line_count}" -le "${min_len}" ]]; then
		echo "Not splitting for test. Too few lines in ${original_file}"
		return
	fi

	shuffled_file=${original_file}.shuf
	test_file=${original_file}.test
	train_file=${original_file}.train

	echo "Shuffling ${original_file}"
	# on Mac there is no `shuf` by default, so using gshuf instead
	# or `sort` as a last resort. (`sort -R` is not real shuffle, and is slow)
	# gshuf is GNU shuf under another name, so it takes no -R flag (that is sort's).
	if command -v shuf > /dev/null 2>&1; then
		shuf -- "${original_file}" > "${shuffled_file}"
	elif command -v gshuf > /dev/null 2>&1; then
		gshuf -- "${original_file}" > "${shuffled_file}"
	elif command -v sort > /dev/null 2>&1; then
		sort -R -- "${original_file}" > "${shuffled_file}"
	fi

	# The input has more than min_len lines, so an empty or missing shuffle output
	# means the shuffle failed (or no tool was found). Stop here rather than emit
	# an empty test set and a train set equal to the input.
	if [[ ! -s "${shuffled_file}" ]]; then
		echo "Shuffling ${original_file} failed" >&2
		rm -f -- "${shuffled_file}"
		return 1
	fi

	head -n "${test_size}" -- "${shuffled_file}" > "${test_file}"

	# This is the trick.
	# -F fixed strings, -x whole-line match, -f patterns from the test file, -v
	# invert: keep only lines that equal no test line. Duplicates of a test line
	# are dropped from train as well, so the two sets never share a line.
	grep -Fvxf "${test_file}" -- "${original_file}" > "${train_file}"

	wc -l -- "${test_file}" "${train_file}"

	rm -- "${shuffled_file}"
No results found