That will be easy if you use something like jieba. Example:
import jieba
# text from http://li-xirong.github.io/pub/icmr2016_chinese_caption.pdf| #!/bin/sh | |
| # Copyright 2023 Khalifah K. Shabazz | |
| # | |
| # Permission is hereby granted, free of charge, to any person obtaining a | |
| # copy of this software and associated documentation files (the “Software”), | |
| # to deal in the Software without restriction, including without limitation | |
| # the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
| # and/or sell copies of the Software, and to permit persons to whom the | |
| # Software is furnished to do so, subject to the following conditions: |
| LEETCODE_USER=leetcode username | |
| GITHUB_TOKEN=github token | |
| GIST_ID=gist id | |
| GIST_FILE=progress.txt |
| https://hackernoon.com/top-10-system-design-interview-questions-for-software-engineers-8561290f0444 | |
| https://medium.com/@codingfreak/binary-tree-interview-questions-and-practice-problems-439df7e5ea1f | |
| https://cspiration.com/leetcodeClassification | |
| heap sort | |
| https://www.hackerearth.com/practice/algorithms/sorting/heap-sort/tutorial/ | |
| https://medium.com/@randerson112358/lets-build-a-min-heap-4d863cac6521 | |
| https://stackblitz.com/ |
| from typing import List, Tuple, Optional, Union, Any, ContextManager, Callable, overload | |
| import builtins | |
| import math | |
| import pickle | |
| class dtype: ... | |
| _dtype = dtype |
| #!/bin/bash | |
| # | |
| # script to extract ImageNet dataset | |
| # ILSVRC2012_img_train.tar (about 138 GB) | |
| # ILSVRC2012_img_val.tar (about 6.3 GB) | |
| # make sure ILSVRC2012_img_train.tar & ILSVRC2012_img_val.tar in your current directory | |
| # | |
| # https://github.com/facebook/fb.resnet.torch/blob/master/INSTALL.md | |
| # | |
| # train/ |
| # -*- coding: utf-8 -*- | |
| # | |
| # Author: Taylor G Smith | |
| # | |
| # Recommender system ranking metrics derived from Spark source for use with | |
| # Python-based recommender libraries (i.e., implicit, | |
| # http://github.com/benfred/implicit/). These metrics are derived from the | |
| # original Spark Scala source code for recommender metrics. | |
| # https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala |
| #!/usr/bin/env python | |
| # -*- coding:UTF-8 -*- | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.init as init | |
| def weight_init(m): | |
| ''' |
| # My tmux configuration, partly based on https://github.com/wbkang/wbk-stow/blob/master/tmux-config/.tmux.conf | |
| # Scroll History | |
| set -g history-limit 50000 | |
| # show messages for 4 seconds instead | |
| set -g display-time 4000 | |
| # set first window to index 1 (not 0) to map more to the keyboard layout | |
| set-option -g renumber-windows on |
| import datetime | |
| import linecache | |
| import os | |
| import pynvml3 | |
| import torch | |
| print_tensor_sizes = True | |
| last_tensor_sizes = set() | |
| gpu_profile_fn = f'{datetime.datetime.now():%d-%b-%y-%H:%M:%S}-gpu_mem_prof.txt' |