Created
March 23, 2017 01:27
-
-
Save hugabor/c763e073141fdd9261b8ec3c3c4f35e3 to your computer and use it in GitHub Desktop.
Returns a list of tokens (coefficient, element symbols, and subscripts) parsed from a string describing a chemical compound
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Character classes used by the tokenizer. Each is an array of
// single-character strings, frozen so that an accidental push/splice cannot
// silently change which characters are accepted.
const UPPERCASE_LETTERS = Object.freeze('ABCDEFGHIJKLMNOPQRSTUVWXYZ'.split(''))
const LOWERCASE_LETTERS = Object.freeze('abcdefghijklmnopqrstuvwxyz'.split(''))
const DIGITS = Object.freeze('0123456789'.split(''))
// Only the plain space character counts as skippable whitespace.
const WHITESPACE = Object.freeze(' '.split(''))

// TOKEN TYPES
// Values stored in the `type` field of the tokens returned by tokenize().
const COEFF = 'COEFF'
const ELEM = 'ELEM'
const SUBSCRIPT = 'SUBSCRIPT'
// NOTE(review): DUMMY is not referenced by the tokenizer in this file —
// confirm whether anything else still depends on it.
const DUMMY = 'DUMMY'
/**
 * Splits a chemical-compound string into a flat list of tokens.
 *
 * The scanner is a small state machine: an optional leading coefficient
 * (digits with an optional decimal part), followed by repeated element
 * symbols (one uppercase letter plus any number of lowercase letters), each
 * optionally followed by a digit subscript. Characters listed in WHITESPACE
 * are skipped between tokens. Element symbols are not checked against the
 * periodic table.
 *
 * Scanning stops at the first character that cannot start an element. That
 * character is reported with console.log and the tokens collected up to that
 * point are still returned.
 *
 * @param {string} str - Compound text, e.g. '45.3 Cu 3 H 2 O'.
 * @returns {Array<{type: string, str: string, startIndex: number}>} Tokens in
 *   input order; `startIndex` is the offset of the token's first character
 *   within `str`.
 */
function tokenize(str) {
  const chars = str.split('')
  const tokenList = []
  let nextIndex = 0
  // Token currently being assembled by the active state function.
  let token

  function hasNext() {
    return nextIndex < chars.length
  }

  // Consumes and returns the next character, or '' at end of input.
  function next() {
    if (hasNext()) {
      return chars[nextIndex++]
    }
    return ''
  }

  // Un-reads the most recently consumed character.
  function stepBack() {
    --nextIndex
  }

  function newToken(type, startIndex = nextIndex) {
    return { type, str: '', startIndex }
  }

  // Appends the current token to the result unless it is empty.
  // Returns whether a token was actually emitted.
  function submitToken() {
    if (token.str.length === 0) {
      return false
    }
    tokenList.push(token)
    return true
  }

  function addToToken(c) {
    if (token.str.length === 0) {
      // First character of the token: `c` has already been consumed, so its
      // offset is one behind the read position.
      token.startIndex = nextIndex - 1
    }
    token.str += c
  }

  // Consumes one character into the current token if it is in `validChars`
  // (an array of characters or a string); otherwise leaves the input untouched.
  function accept(validChars) {
    if (!hasNext()) {
      return false
    }
    const c = next()
    if (validChars.includes(c)) {
      addToToken(c)
      return true
    }
    stepBack()
    return false
  }

  function acceptAll(validChars) {
    while (accept(validChars)) {}
  }

  // Skips a run of characters from `validChars` without recording them.
  function ignore(validChars) {
    while (hasNext()) {
      if (!validChars.includes(next())) {
        stepBack()
        return
      }
    }
  }

  // Each state function scans one kind of token and returns the next state.
  function coeffStateFunc() {
    token = newToken(COEFF)
    acceptAll(DIGITS)
    if (accept('.')) {
      acceptAll(DIGITS)
      if (token.str === '.') {
        // A '.' with no digit on either side is not a number: un-read it so
        // it is reported as invalid instead of being emitted as a COEFF.
        stepBack()
        token.str = ''
      }
    }
    submitToken()
    return elemStateFunc
  }

  function elemStateFunc() {
    token = newToken(ELEM)
    if (accept(UPPERCASE_LETTERS)) {
      acceptAll(LOWERCASE_LETTERS)
    }
    // No element here means either end of input or an invalid character.
    if (!submitToken()) {
      return endStateFunc
    }
    return subscriptStateFunc
  }

  function subscriptStateFunc() {
    token = newToken(SUBSCRIPT)
    acceptAll(DIGITS)
    submitToken()
    return elemStateFunc
  }

  // Sentinel state: reaching it terminates the loop in stateMachine().
  function endStateFunc() {}

  function stateMachine() {
    // Declared locally; an undeclared assignment here would create a global
    // in sloppy mode and throw a ReferenceError in strict mode / ES modules.
    let nextStateFunc = coeffStateFunc // start
    ignore(WHITESPACE)
    while (nextStateFunc !== endStateFunc) {
      nextStateFunc = nextStateFunc()
      ignore(WHITESPACE)
    }
    if (hasNext()) {
      // Arguments are evaluated left to right, so next() has already advanced
      // nextIndex by the time the column is computed.
      console.log('TOKENIZER ERROR:', next(), 'is not valid!', 'Col:', nextIndex - 1)
    }
  }

  stateMachine()
  return tokenList
}
// TESTS
// Manual smoke test: each sample string is tokenized and the resulting token
// list is printed between '-----' and '=====' delimiter lines.
let testCases = [
  '45.3 Cu 3 H 2 O JK',
  'H 2 O',
  'H 2 o',
  'C6Hellothisisaninvalidelementsymbol12O6'
]

testCases.forEach((input) => {
  console.log('-----')
  console.log('Input:', input)
  console.log('Output:')
  console.log(tokenize(input))
  console.log('=====')
})
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment