Implement UTF-8
This commit is contained in:
parent
4853e42992
commit
eacbc95cea
|
@ -24,8 +24,8 @@ TODO
|
||||||
* Optimization pass to turn multiply loops into commands that do `x += y * c`
|
* Optimization pass to turn multiply loops into commands that do `x += y * c`
|
||||||
* Make VM use a Proxied object that gives out 0 for nonexistent elements for
|
* Make VM use a Proxied object that gives out 0 for nonexistent elements for
|
||||||
its memory
|
its memory
|
||||||
* Implement UTF-8 I/O
|
|
||||||
* Keep a cache of compiled programs in `run()`
|
* Keep a cache of compiled programs in `run()`
|
||||||
|
* Support for other types of EOF?
|
||||||
|
|
||||||
### gir.html
|
### gir.html
|
||||||
* Implement a UI
|
* Implement a UI
|
||||||
|
|
134
gir.js
134
gir.js
|
@ -442,7 +442,7 @@ function optimize(parsed) {
|
||||||
// Virtual machine
|
// Virtual machine
|
||||||
// ------------------------------------------------------------------
|
// ------------------------------------------------------------------
|
||||||
|
|
||||||
// ([flatCommandObject]) → girVMState
|
// ([flatCommandObject], [int]) → girVMState
|
||||||
function newVM(program, input) {
|
function newVM(program, input) {
|
||||||
return {
|
return {
|
||||||
// Initial state for the machine
|
// Initial state for the machine
|
||||||
|
@ -453,7 +453,7 @@ function newVM(program, input) {
|
||||||
tapeHead: 0,
|
tapeHead: 0,
|
||||||
|
|
||||||
input: input,
|
input: input,
|
||||||
output: ''
|
output: []
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -474,8 +474,9 @@ function runVM(state, maxCycles = null) {
|
||||||
}
|
}
|
||||||
let tapeHead = state.tapeHead;
|
let tapeHead = state.tapeHead;
|
||||||
|
|
||||||
let input = state.input;
|
// Create copies of input and output, since we might modify them
|
||||||
let output = state.output;
|
let input = state.input.slice();
|
||||||
|
let output = state.output.slice();
|
||||||
|
|
||||||
let complete = false;
|
let complete = false;
|
||||||
let cycle = 0;
|
let cycle = 0;
|
||||||
|
@ -526,22 +527,18 @@ function runVM(state, maxCycles = null) {
|
||||||
|
|
||||||
case writeByte:
|
case writeByte:
|
||||||
if(!(index in memory)) memory[index] = 0;
|
if(!(index in memory)) memory[index] = 0;
|
||||||
// TODO: utf-8
|
output.push(memory[index]);
|
||||||
output += String.fromCodePoint(memory[index]);
|
|
||||||
ip++;
|
ip++;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case readByte:
|
case readByte:
|
||||||
// TODO: utf-8
|
|
||||||
// Have we reached EOF?
|
// Have we reached EOF?
|
||||||
if(input.length == 0) {
|
if(input.length == 0) {
|
||||||
// Yes, return 0
|
// Yes, return 0
|
||||||
memory[index] = 0;
|
memory[index] = 0;
|
||||||
} else {
|
} else {
|
||||||
// No, return character
|
// No, return character
|
||||||
memory[index] = input.codePointAt(0);
|
memory[index] = input.shift();
|
||||||
// FIXME: This only works for BMP
|
|
||||||
input = input.slice(1);
|
|
||||||
}
|
}
|
||||||
ip++;
|
ip++;
|
||||||
break;
|
break;
|
||||||
|
@ -589,6 +586,117 @@ function runVM(state, maxCycles = null) {
|
||||||
return {state: newState, complete: complete, cycles: cycle};
|
return {state: newState, complete: complete, cycles: cycle};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ------------------------------------------------------------------
|
||||||
|
// UTF-8
|
||||||
|
// ------------------------------------------------------------------
|
||||||
|
|
||||||
|
// string → [int]
// Encode a JavaScript string as an array of UTF-8 byte values (0–255).
//
// The string is walked one code point at a time, so characters outside
// the BMP (stored as a surrogate pair) produce a single 4-byte sequence.
// An unpaired surrogate has no well-formed UTF-8 representation; it is
// encoded as U+FFFD REPLACEMENT CHARACTER (EF BF BD), which is also what
// TextEncoder does.
function encodeUTF8(string) {
	const encoded = [];

	// for...of iterates by code point rather than by UTF-16 code unit
	for(const character of string) {
		let codepoint = character.codePointAt(0);

		// Only a lone surrogate can show up here, since proper pairs
		// have already been combined by the iterator
		if(codepoint >= 0xD800 && codepoint <= 0xDFFF) {
			codepoint = 0xFFFD;
		}

		if(codepoint < 0x80) {
			// 0xxxxxxx
			encoded.push(codepoint);
		} else if(codepoint < 0x0800) {
			// 110xxxxx 10xxxxxx
			encoded.push(
				codepoint >> 6 | 0b11000000,
				codepoint & 0b00111111 | 0b10000000
			);
		} else if(codepoint < 0x10000) {
			// 1110xxxx 10xxxxxx 10xxxxxx
			encoded.push(
				codepoint >> 12 | 0b11100000,
				codepoint >> 6 & 0b00111111 | 0b10000000,
				codepoint & 0b00111111 | 0b10000000
			);
		} else {
			// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			encoded.push(
				codepoint >> 18 | 0b11110000,
				codepoint >> 12 & 0b00111111 | 0b10000000,
				codepoint >> 6 & 0b00111111 | 0b10000000,
				codepoint & 0b00111111 | 0b10000000
			);
		}
	}

	return encoded;
}
|
||||||
|
|
||||||
|
// [int] → string
// Decode an array of byte values as UTF-8 and return the resulting string.
//
// Malformed input never throws. Each malformed sequence is replaced with
// one U+FFFD REPLACEMENT CHARACTER. A sequence is malformed when:
//  * the lead byte is not a valid UTF-8 lead byte (this includes stray
//    continuation bytes and values outside 0–255)
//  * it is cut short, either by the end of input or by a byte that is
//    not a continuation byte. The offending byte is *not* consumed, so
//    it gets decoded on its own in the next iteration
//  * it decodes to an overlong form, a surrogate (U+D800–U+DFFF) or a
//    value above U+10FFFF
function decodeUTF8(encoded) {
	const REPLACEMENT = 0xFFFD;
	// Smallest code point that is allowed to use N continuation bytes,
	// indexed by N. Anything smaller is an overlong encoding.
	const MINIMUM_FOR_LENGTH = [0x00, 0x80, 0x0800, 0x10000];

	const codePoints = [];

	let i = 0;
	while(i < encoded.length) {
		const firstByte = encoded[i];
		i++;

		// Determine number of continuation bytes to read and
		// decode the payload bits of the first byte into codePoint
		let toRead;
		let codePoint;
		if(firstByte >> 7 === 0) {
			// 0xxxxxxx
			toRead = 0;
			codePoint = firstByte;
		} else if(firstByte >> 5 === 0b110) {
			// 110xxxxx 10xxxxxx
			toRead = 1;
			codePoint = firstByte & 0b00011111;
		} else if(firstByte >> 4 === 0b1110) {
			// 1110xxxx 10xxxxxx 10xxxxxx
			toRead = 2;
			codePoint = firstByte & 0b00001111;
		} else if(firstByte >> 3 === 0b11110) {
			// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			toRead = 3;
			codePoint = firstByte & 0b00000111;
		} else {
			// Illegal lead byte, push replacement char
			codePoints.push(REPLACEMENT);
			continue;
		}

		let remaining = toRead;
		while(remaining > 0 && i < encoded.length) {
			const continuationByte = encoded[i];

			// Peek before consuming: a byte that is not a continuation
			// byte may start the next character and must be kept
			if(continuationByte >> 6 !== 0b10) {
				break;
			}

			codePoint = codePoint << 6 | continuationByte & 0b00111111;
			i++;
			remaining--;
		}

		// Number.isInteger also rejects undefined / fractional cells,
		// which would otherwise make String.fromCodePoint throw
		const wellFormed =
			remaining === 0 &&
			Number.isInteger(codePoint) &&
			codePoint >= MINIMUM_FOR_LENGTH[toRead] &&
			codePoint <= 0x10FFFF &&
			!(codePoint >= 0xD800 && codePoint <= 0xDFFF);

		codePoints.push(wellFormed ? codePoint : REPLACEMENT);
	}

	// Convert to a string. Done per code point rather than by spreading
	// into one call, so long outputs cannot hit the argument-count limit
	return codePoints.map(x => String.fromCodePoint(x)).join('');
}
|
||||||
|
|
||||||
// ------------------------------------------------------------------
|
// ------------------------------------------------------------------
|
||||||
// User-facing functions
|
// User-facing functions
|
||||||
// ------------------------------------------------------------------
|
// ------------------------------------------------------------------
|
||||||
|
@ -598,14 +706,14 @@ function compile(program) {
|
||||||
return optimize(parse(program));
|
return optimize(parse(program));
|
||||||
}
|
}
|
||||||
|
|
||||||
// (string, string, bool) → string
|
// (string, string, int) → string
|
||||||
function run(program, input, maxCycles = null) {
|
function run(program, input, maxCycles = null) {
|
||||||
// TODO; Cache programs
|
// TODO; Cache programs
|
||||||
let compiled = compile(program);
|
let compiled = compile(program);
|
||||||
let vm = newVM(compiled, input);
|
let vm = newVM(compiled, encodeUTF8(input));
|
||||||
|
|
||||||
let result = runVM(vm, maxCycles);
|
let result = runVM(vm, maxCycles);
|
||||||
let output = result.state.output;
|
let output = decodeUTF8(result.state.output);
|
||||||
|
|
||||||
// If didn't complete, mark it in the output
|
// If didn't complete, mark it in the output
|
||||||
if(!result.complete) {
|
if(!result.complete) {
|
||||||
|
|
Loading…
Reference in New Issue