Implement UTF-8
This commit is contained in:
parent
4853e42992
commit
eacbc95cea
|
@ -24,8 +24,8 @@ TODO
|
|||
* Optimization pass to turn multiply loops into commands that do `x += y * c`
|
||||
* Make VM use a Proxied object that gives out 0 for nonexistent elements for
|
||||
its memory
|
||||
* Implement UTF-8 I/O
|
||||
* Keep a cache of compiled programs in `run()`
|
||||
* Support for other types of EOF?
|
||||
|
||||
### gir.html
|
||||
* Implement a UI
|
||||
|
|
134
gir.js
134
gir.js
|
@ -442,7 +442,7 @@ function optimize(parsed) {
|
|||
// Virtual machine
|
||||
// ------------------------------------------------------------------
|
||||
|
||||
// ([flatCommandObject]) → girVMState
|
||||
// ([flatCommandObject], [int]) → girVMState
|
||||
function newVM(program, input) {
|
||||
return {
|
||||
// Initial state for the machine
|
||||
|
@ -453,7 +453,7 @@ function newVM(program, input) {
|
|||
tapeHead: 0,
|
||||
|
||||
input: input,
|
||||
output: ''
|
||||
output: []
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -474,8 +474,9 @@ function runVM(state, maxCycles = null) {
|
|||
}
|
||||
let tapeHead = state.tapeHead;
|
||||
|
||||
let input = state.input;
|
||||
let output = state.output;
|
||||
// Create copies of input and output, since we might modify them
|
||||
let input = state.input.slice();
|
||||
let output = state.output.slice();
|
||||
|
||||
let complete = false;
|
||||
let cycle = 0;
|
||||
|
@ -526,22 +527,18 @@ function runVM(state, maxCycles = null) {
|
|||
|
||||
case writeByte:
|
||||
if(!(index in memory)) memory[index] = 0;
|
||||
// TODO: utf-8
|
||||
output += String.fromCodePoint(memory[index]);
|
||||
output.push(memory[index]);
|
||||
ip++;
|
||||
break;
|
||||
|
||||
case readByte:
|
||||
// TODO: utf-8
|
||||
// Have we reached EOF?
|
||||
if(input.length == 0) {
|
||||
// Yes, return 0
|
||||
memory[index] = 0;
|
||||
} else {
|
||||
// No, return character
|
||||
memory[index] = input.codePointAt(0);
|
||||
// FIXME: This only works for BMP
|
||||
input = input.slice(1);
|
||||
memory[index] = input.shift();
|
||||
}
|
||||
ip++;
|
||||
break;
|
||||
|
@ -589,6 +586,117 @@ function runVM(state, maxCycles = null) {
|
|||
return {state: newState, complete: complete, cycles: cycle};
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// UTF-8
|
||||
// ------------------------------------------------------------------
|
||||
|
||||
// string → [int]
// Encode a JavaScript string as an array of UTF-8 byte values (0–255).
// The string is walked by code point, so valid surrogate pairs become a
// single 4-byte sequence. Unpaired surrogates have no well-formed UTF-8
// representation and are encoded as U+FFFD (EF BF BD) instead.
function encodeUTF8(string) {
	const encoded = [];

	for(const character of string) {
		let codepoint = character.codePointAt(0);

		// String iteration yields a lone surrogate as its own one-unit
		// "character"; emitting it verbatim would produce ill-formed UTF-8
		if(codepoint >= 0xD800 && codepoint <= 0xDFFF) {
			codepoint = 0xFFFD;
		}

		if(codepoint < 0x80) {
			// 0xxxxxxx
			encoded.push(codepoint);
		} else if(codepoint < 0x0800) {
			// 110xxxxx 10xxxxxx
			encoded.push(
				codepoint >> 6 | 0b11000000,
				codepoint & 0b00111111 | 0b10000000
			);
		} else if(codepoint < 0x10000) {
			// 1110xxxx 10xxxxxx 10xxxxxx
			encoded.push(
				codepoint >> 12 | 0b11100000,
				codepoint >> 6 & 0b00111111 | 0b10000000,
				codepoint & 0b00111111 | 0b10000000
			);
		} else {
			// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			encoded.push(
				codepoint >> 18 | 0b11110000,
				codepoint >> 12 & 0b00111111 | 0b10000000,
				codepoint >> 6 & 0b00111111 | 0b10000000,
				codepoint & 0b00111111 | 0b10000000
			);
		}
	}

	return encoded;
}
|
||||
|
||||
// [int] → string
// Decode an array of UTF-8 byte values into a JavaScript string.
// Malformed input never throws: every rejected sequence contributes one
// U+FFFD replacement character. A sequence is rejected when
//  * its lead byte is not a valid UTF-8 lead byte (or not a byte at all),
//  * it is cut short by the end of input or by a non-continuation byte,
//  * it is an overlong form, encodes a surrogate, or exceeds U+10FFFF.
// A byte that fails the continuation check is NOT consumed, so it is
// decoded again as the start of the next sequence.
function decodeUTF8(encoded) {
	const REPLACEMENT = 0xFFFD;
	// Smallest code point that legitimately needs N continuation bytes;
	// anything lower was encoded in an overlong form
	const minimumFor = [0, 0x80, 0x0800, 0x10000];
	// Guards the bit tricks below against non-integers and values
	// outside 0–255, which `>>` would otherwise coerce or wrap
	const isByte = x => Number.isInteger(x) && x >= 0 && x <= 0xFF;

	const codePoints = [];

	for(let i = 0; i < encoded.length;) {
		const firstByte = encoded[i];
		i++;

		let toRead;
		let codePoint;

		// Determine number of continuation bytes to read and
		// decode the first byte into codePoint
		// Since we'll do the shifts later, we just mask here
		if(!isByte(firstByte)) {
			// Not a byte value at all, push replacement char
			codePoints.push(REPLACEMENT);
			continue;
		} else if(firstByte >> 7 === 0) {
			// 0xxxxxxx
			toRead = 0;
			codePoint = firstByte;
		} else if(firstByte >> 5 === 0b110) {
			// 110xxxxx 10xxxxxx
			toRead = 1;
			codePoint = firstByte & 0b00011111;
		} else if(firstByte >> 4 === 0b1110) {
			// 1110xxxx 10xxxxxx 10xxxxxx
			toRead = 2;
			codePoint = firstByte & 0b00001111;
		} else if(firstByte >> 3 === 0b11110) {
			// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			toRead = 3;
			codePoint = firstByte & 0b00000111;
		} else {
			// Illegal sequence, push replacement char
			codePoints.push(REPLACEMENT);
			continue;
		}

		let remaining = toRead;
		while(remaining > 0 && i < encoded.length) {
			const continuationByte = encoded[i];

			// Check that we have a valid continuation byte *before*
			// consuming it, so a stray lead/ASCII byte isn't swallowed
			if(!isByte(continuationByte) || continuationByte >> 6 !== 0b10) {
				break;
			}

			codePoint = codePoint << 6 | continuationByte & 0b00111111;
			i++;
			remaining--;
		}

		// Accept only complete, shortest-form sequences that encode a
		// Unicode scalar value; String.fromCodePoint throws above 0x10FFFF
		const wellFormed = remaining === 0 &&
			codePoint >= minimumFor[toRead] &&
			codePoint <= 0x10FFFF &&
			!(codePoint >= 0xD800 && codePoint <= 0xDFFF);

		codePoints.push(wellFormed ? codePoint : REPLACEMENT);
	}

	// Convert to a string
	return codePoints.map(x => String.fromCodePoint(x)).join('');
}
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// User-facing functions
|
||||
// ------------------------------------------------------------------
|
||||
|
@ -598,14 +706,14 @@ function compile(program) {
|
|||
return optimize(parse(program));
|
||||
}
|
||||
|
||||
// (string, string, bool) → string
|
||||
// (string, string, int) → string
|
||||
function run(program, input, maxCycles = null) {
|
||||
// TODO: Cache programs
|
||||
let compiled = compile(program);
|
||||
let vm = newVM(compiled, input);
|
||||
let vm = newVM(compiled, encodeUTF8(input));
|
||||
|
||||
let result = runVM(vm, maxCycles);
|
||||
let output = result.state.output;
|
||||
let output = decodeUTF8(result.state.output);
|
||||
|
||||
// If didn't complete, mark it in the output
|
||||
if(!result.complete) {
|
||||
|
|
Loading…
Reference in New Issue