Skip to content

Instantly share code, notes, and snippets.

@hugabor
Created March 23, 2017 01:27
Show Gist options
  • Select an option

  • Save hugabor/c763e073141fdd9261b8ec3c3c4f35e3 to your computer and use it in GitHub Desktop.

Select an option

Save hugabor/c763e073141fdd9261b8ec3c3c4f35e3 to your computer and use it in GitHub Desktop.
Returns a list of tokens parsed from a string representing a chemical compound.
// CHARACTER CLASSES
// Each class is an array of one-character strings.
const UPPERCASE_LETTERS = Array.from('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
const LOWERCASE_LETTERS = UPPERCASE_LETTERS.map((letter) => letter.toLowerCase())
const DIGITS = Array.from({ length: 10 }, (_, digit) => String(digit))
// Only the plain space character is listed here.
const WHITESPACE = [' ']

// TOKEN TYPES
// String tags stored in the `type` field of each token.
const COEFF = 'COEFF'
const ELEM = 'ELEM'
const SUBSCRIPT = 'SUBSCRIPT'
const DUMMY = 'DUMMY'
/**
 * Splits a chemical-compound string into a flat list of tokens.
 *
 * The input is read left to right by a small state machine:
 *   1. an optional coefficient (digits, optionally followed by '.' and
 *      more digits),
 *   2. then repeatedly: an element symbol (one uppercase letter followed
 *      by any number of lowercase letters) and an optional subscript
 *      (digits).
 * Characters listed in WHITESPACE are skipped between tokens. Scanning
 * stops at the first position where no element symbol can be read; if
 * input remains at that point, an error line is written with console.log
 * and the tokens collected so far are still returned.
 *
 * @param {string} str - Text to tokenize.
 * @returns {Array<{type: string, str: string, startIndex: number}>}
 *   Tokens in input order. `type` is COEFF, ELEM or SUBSCRIPT, `str` is
 *   the matched text and `startIndex` is the index of its first character
 *   in `str`.
 */
function tokenize(str) {
  const chars = str.split('')
  const tokenList = []
  let nextIndex = 0
  // Token currently being built; replaced at the start of every state.
  let token

  /** True while there are unread characters. */
  function hasNext() {
    return nextIndex < chars.length
  }

  /** Consumes and returns the next character, or '' at end of input. */
  function next() {
    if (hasNext()) {
      return chars[nextIndex++]
    }
    return ''
  }

  /** Creates an empty token of the given type. */
  function newToken(type, startIndex = nextIndex) {
    return {
      type: type,
      str: '',
      startIndex: startIndex
    }
  }

  /**
   * Appends the current token to the result if it matched any text.
   * Returns whether the token was appended.
   */
  function submitToken() {
    if (token.str.length === 0) {
      return false
    }
    tokenList.push(token)
    return true
  }

  /** Appends an already-consumed character to the current token. */
  function addToToken(c) {
    if (token.str.length === 0) {
      // The character has been consumed, so it sits one behind the cursor.
      token.startIndex = nextIndex - 1
    }
    token.str += c
  }

  /**
   * Consumes the next character into the current token if it is one of
   * validChars (an array of characters or a string). Returns whether a
   * character was consumed.
   */
  function accept(validChars) {
    if (!hasNext() || !validChars.includes(chars[nextIndex])) {
      return false
    }
    addToToken(next())
    return true
  }

  /** Consumes characters into the current token while they are valid. */
  function acceptAll(validChars) {
    while (accept(validChars)) {}
  }

  /** Skips over characters in validChars without recording them. */
  function ignore(validChars) {
    while (hasNext() && validChars.includes(chars[nextIndex])) {
      nextIndex++
    }
  }

  // Each state function reads at most one token and returns the next state.

  /** Start state: optional leading coefficient, e.g. "45.3". */
  function coeffStateFunc() {
    token = newToken(COEFF)
    acceptAll(DIGITS)
    if (accept('.')) {
      acceptAll(DIGITS)
    }
    submitToken()
    return elemStateFunc
  }

  /** Element symbol, e.g. "Cu"; ends the machine when none is found. */
  function elemStateFunc() {
    token = newToken(ELEM)
    if (accept(UPPERCASE_LETTERS)) {
      acceptAll(LOWERCASE_LETTERS)
    }
    if (!submitToken()) {
      return endStateFunc
    }
    return subscriptStateFunc
  }

  /** Optional subscript following an element, e.g. the "2" in "H2". */
  function subscriptStateFunc() {
    token = newToken(SUBSCRIPT)
    acceptAll(DIGITS)
    submitToken()
    return elemStateFunc
  }

  /** Sentinel state: only compared by identity, never called. */
  function endStateFunc() {}

  /** Runs the states until the end state, then reports leftover input. */
  function stateMachine() {
    // Must be a local binding: assigning to an undeclared name creates an
    // implicit global and throws a ReferenceError in strict mode / ES modules.
    let nextStateFunc = coeffStateFunc // start
    ignore(WHITESPACE)
    while (nextStateFunc !== endStateFunc) {
      nextStateFunc = nextStateFunc()
      ignore(WHITESPACE)
    }
    if (hasNext()) {
      // next() is evaluated before nextIndex - 1, so Col is the index of
      // the offending character.
      console.log('TOKENIZER ERROR:', next(), 'is not valid!', 'Col:', nextIndex - 1)
    }
  }

  stateMachine()
  return tokenList
}
// TESTS
// Runs the tokenizer over a few sample inputs (valid and invalid) and
// prints each input followed by the token list it produced.
let testCases = [
  '45.3 Cu 3 H 2 O JK',
  'H 2 O',
  'H 2 o',
  'C6Hellothisisaninvalidelementsymbol12O6'
]
testCases.forEach((input) => {
  console.log('-----')
  console.log('Input:', input)
  console.log('Output:')
  const tokens = tokenize(input)
  console.log(tokens)
  console.log('=====')
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment