file parser, file merge, callf command remove, SM fixes. todo: fix interpreter and analyzer with new algorithm

This commit is contained in:
ProgramSnail 2025-02-23 15:10:22 +03:00
parent 51381aea43
commit 343a21ee2d
8 changed files with 256 additions and 77 deletions

View file

@ -1,6 +1,7 @@
#pragma once
#include <ostream>
#include <vector>
extern "C" {
#include "parser.h"
@ -35,7 +36,7 @@ enum class Cmd : int8_t {
ARRAY,
FAIL,
LINE,
CALLF,
// CALLF,
PATT,
// NOTE: no longer used
// Lread,
@ -49,6 +50,8 @@ enum class Cmd : int8_t {
// Reads a binary bytecode file by name and unpacks it.
Bytefile *read_file(const char *fname);
// Merges several bytecode files into a single one.
// NOTE(review): the definition visible in this commit has the signature
// `Bytefile *merge_files(std::vector<Bytefile *> &&bytefiles)`, which does not
// match this declaration (vector of Bytefile values by const reference) —
// confirm which one is intended and make them agree.
Bytefile *merge_files(const std::vector<Bytefile> &bytefiles);
// Decodes the command at *ip and returns it as a (command, low part) pair.
// NOTE(review): presumably advances *ip past the decoded command — confirm
// against the implementation.
std::pair<Cmd, uint8_t> parse_command(char **ip, const Bytefile *bf);
// Same as above, with an output stream argument (presumably used to print the
// decoded command — TODO confirm).
std::pair<Cmd, uint8_t> parse_command(char **ip, const Bytefile *bf,
std::ostream &out);

View file

@ -255,7 +255,7 @@ static inline void **var_by_category(enum VarCategory category, size_t id) {
// s.bf->global_area_size);
}
#endif
var = s.stack + STACK_SIZE - 1 - id;
var = s.bf->global_ptr + STACK_SIZE - 1 - id;
break;
case VAR_LOCAL:
#ifndef WITH_CHECK

View file

@ -164,7 +164,7 @@ enum CMD_CTRLS {
CMD_CTRL_ARRAY,
CMD_CTRL_FAIL,
CMD_CTRL_LINE,
CMD_CTRL_CALLF,
// CMD_CTRL_CALLF,
};
enum CMD_PATTS {

View file

@ -7,17 +7,24 @@
#include "../../runtime/runtime.h"
#include "../../runtime/runtime_common.h"
/* One address-substitution record.
 * NOTE(review): presumably mirrors an entry of the substs area (an offset
 * followed by a NUL-terminated symbol name) — confirm against the code that
 * walks `substs_ptr`. */
typedef struct {
uint offset;
/* Zero-length trailing array (compiler extension): the label bytes are
 * expected to follow the struct directly in memory. */
char label[0];
} Subst;
/* The unpacked representation of bytecode file */
typedef struct {
uint main_offset; /* offset of the function 'main' */
char *string_ptr; /* A pointer to the beginning of the string table */
int *imports_ptr; /* A pointer to the beginning of imports table */
int *public_ptr; /* A pointer to the beginning of publics table */
char *code_ptr; /* A pointer to the bytecode itself */
int *global_ptr; /* A pointer to the global area */
int code_size; /* The size (in bytes) of code */
uint main_offset; /* offset of the function 'main' */
char *string_ptr; /* A pointer to the beginning of the string table */
int *imports_ptr; /* A pointer to the beginning of imports table */
int *public_ptr; /* A pointer to the beginning of publics table */
char *code_ptr; /* A pointer to the bytecode itself */
void **global_ptr; /* A pointer to the global area */
char *substs_ptr; /* A pointer to the substs area */
int code_size; /* The size (in bytes) of code */
uint stringtab_size; /* The size (in bytes) of the string table */
uint global_area_size; /* The size (in words) of global area */
uint substs_area_size; /* number of required address substitutions */
uint imports_number; /* The number of imports */
uint public_symbols_number; /* The number of public symbols */
char buffer[0];
@ -64,6 +71,10 @@ static inline size_t get_public_offset_unsafe(const Bytefile *bf, size_t i) {
// read from ip
// Stores the 32-bit value `x` at address `ip`. No bounds checking is done.
static inline void ip_write_int_unsafe(char *ip, int32_t x) {
  int32_t *const slot = (int32_t *)ip;
  slot[0] = x;
}
static inline uint16_t ip_read_half_int_unsafe(char **ip) {
*ip += sizeof(uint16_t);
return *(uint16_t *)((*ip) - sizeof(uint16_t));
@ -124,6 +135,13 @@ static inline size_t get_public_offset_safe(const Bytefile *f, size_t i) {
// read from ip
// Bounds-checked 32-bit store into the code segment of `bf`.
//
// Reports a failure unless the four bytes starting at `ip` lie entirely inside
// [bf->code_ptr, bf->code_ptr + bf->code_size); otherwise writes `x` at `ip`.
// Unlike the read helpers, a stray write would corrupt neighbouring data, so
// the lower bound is checked as well as the upper one.
static inline void ip_write_int_safe(char *ip, int32_t x, const Bytefile *bf) {
  if (ip < bf->code_ptr ||
      ip + sizeof(int32_t) > bf->code_ptr + bf->code_size) {
    failure("last command is invalid, int parameter can not be written\n");
  }
  ip_write_int_unsafe(ip, x);
}
static inline uint16_t ip_read_half_int_safe(char **ip, const Bytefile *bf) {
if (*ip + sizeof(uint16_t) > bf->code_ptr + bf->code_size) {
failure("last command is invalid, int parameter can not be read\n");

View file

@ -43,7 +43,6 @@ void analyze(uint32_t mod_id) {
};
auto const func_to_visit_push = [&saved_current_ip, mod_id, &visited,
&current_stack_depth,
&to_visit_func](size_t offset) {
if (visited[offset] == NOT_VISITED) {
visited[offset] = 0;
@ -258,10 +257,11 @@ void analyze(uint32_t mod_id) {
is_in_closure = (cmd == Cmd::CBEGIN);
break;
case Cmd::CLOSURE: {
uint closure_offset = ip_read_int_unsafe(&current_ip); // closure offset
size_t args_count = ip_read_int_unsafe(&current_ip); // args count
/*uint closure_offset = */ ip_read_int_unsafe(
&current_ip); // closure offset
size_t args_count = ip_read_int_unsafe(&current_ip); // args count
extra_stack_during_opr = args_count;
for (aint i = 0; i < args_count; i++) {
for (size_t i = 0; i < args_count; i++) {
aint arg_type = ip_read_byte_unsafe(&current_ip);
aint arg_id = ip_read_int_unsafe(&current_ip);
check_correct_var(arg_type, arg_id);
@ -296,7 +296,7 @@ void analyze(uint32_t mod_id) {
}
++current_stack_depth;
if (call_offset >= bf->code_size) {
if ((int)call_offset >= bf->code_size) {
ip_failure(saved_current_ip, mod_id, "jump/call out of file");
}
@ -322,18 +322,18 @@ void analyze(uint32_t mod_id) {
break;
case Cmd::LINE:
break;
case Cmd::CALLF: {
// TODO: find link to real function and replace call (need to save all
// modules in one space) <- optimization
// case Cmd::CALLF: {
// // TODO: find link to real function and replace call (need to save all
// // modules in one space) <- optimization
ip_read_int_unsafe(&current_ip); // function name (str)
uint args_count = ip_read_int_unsafe(&current_ip);
current_stack_depth -= args_count;
if (current_stack_depth < 0) {
ip_failure(saved_current_ip, mod_id, "not enough elements in stack");
}
++current_stack_depth;
} break;
// ip_read_int_unsafe(&current_ip); // function name (str)
// uint args_count = ip_read_int_unsafe(&current_ip);
// current_stack_depth -= args_count;
// if (current_stack_depth < 0) {
// ip_failure(saved_current_ip, mod_id, "not enough elements in stack");
// }
// ++current_stack_depth;
// } break;
case Cmd::PATT:
--current_stack_depth;
if (l == CMD_PATT_STR) {
@ -402,7 +402,7 @@ void analyze(uint32_t mod_id) {
bool is_call = (cmd == Cmd::CLOSURE || cmd == Cmd::CALL);
uint jmp_p = ip_read_int_unsafe(&current_ip);
if (jmp_p >= bf->code_size) {
if ((int)jmp_p >= bf->code_size) {
// NOTE: maybe also should check that > begin (?)
ip_failure(saved_current_ip, mod_id, "jump/call out of file");
}

View file

@ -527,39 +527,39 @@ void run_mod(uint mod_id, int argc, char **argv) {
// maybe some metainfo should be collected
break;
case CMD_CTRL_CALLF: { // CALLF %s %d // call external function
const char *call_func_name = ip_read_string(&s.ip);
size_t args_count = ip_read_int(&s.ip); // args count
// case CMD_CTRL_CALLF: { // CALLF %s %d // call external function
// const char *call_func_name = ip_read_string(&s.ip);
// size_t args_count = ip_read_int(&s.ip); // args count
if (run_stdlib_func(call_func_name, args_count)) {
// case of stdlib function
break;
}
// if (run_stdlib_func(call_func_name, args_count)) {
// // case of stdlib function
// break;
// }
if (strcmp(call_func_name, ".array") == 0) {
call_Barray(args_count, &s.ip, buffer);
break;
}
// if (strcmp(call_func_name, ".array") == 0) {
// call_Barray(args_count, &s.ip, buffer);
// break;
// }
struct ModSearchResult func = mod_search_pub_symbol(call_func_name);
if (func.mod_file == NULL) {
failure("RUNTIME ERROR: external function <%s> with <%zu> args not found\n", call_func_name, args_count);
}
// struct ModSearchResult func = mod_search_pub_symbol(call_func_name);
// if (func.mod_file == NULL) {
// failure("RUNTIME ERROR: external function <%s> with <%zu> args not found\n", call_func_name, args_count);
// }
call_happened = true;
s.is_closure_call = false;
s.call_ip = s.ip;
s.call_module_id = s.current_module_id;
// call_happened = true;
// s.is_closure_call = false;
// s.call_ip = s.ip;
// s.call_module_id = s.current_module_id;
s.current_module_id = func.mod_id;
s.bf = func.mod_file;
// s.current_module_id = func.mod_id;
// s.bf = func.mod_file;
if (func.symbol_offset >= s.bf->code_size) {
s_failure(&s, "jump out of file");
}
s.ip = s.bf->code_ptr + func.symbol_offset;
break;
}
// if (func.symbol_offset >= s.bf->code_size) {
// s_failure(&s, "jump out of file");
// }
// s.ip = s.bf->code_ptr + func.symbol_offset;
// break;
// }
default:
s_failure(&s, "invalid opcode"); // %d-%d\n", h, l);

View file

@ -4,6 +4,8 @@
#include <iostream>
#include <malloc.h>
#include <string.h>
#include <unordered_map>
#include <vector>
#include "parser.hpp"
@ -38,6 +40,8 @@ enum class ArgT {
// void *__start_custom_data;
// void *__stop_custom_data;
//
// Reads a binary bytecode file by name and unpacks it
Bytefile *read_file(const char *fname) {
FILE *f = fopen(fname, "rb");
@ -51,14 +55,15 @@ Bytefile *read_file(const char *fname) {
failure("read file %s: %s\n", fname, strerror(errno));
}
long size = ftell(f);
size_t size = ftell(f);
// [uint] stringtab_size
// [uint] global_area_size
// [uint] substs_area_size
// [uint] imports_number
// [uint] public_symbols_number
// char[0] buffer
long file_header_size = 4 * sizeof(uint) + sizeof(char[0]);
size_t file_header_size = 5 * sizeof(uint) + sizeof(char[0]);
long additional_size = sizeof(Bytefile) - file_header_size;
file = (Bytefile *)malloc(size +
@ -79,9 +84,10 @@ Bytefile *read_file(const char *fname) {
fclose(f);
long imports_size = file->imports_number * sizeof(int);
long public_symbols_size = file->public_symbols_number * 2 * sizeof(int);
long strings_buffer_offset = public_symbols_size + imports_size;
size_t imports_size = file->imports_number * sizeof(int);
size_t public_symbols_size = file->public_symbols_number * 2 * sizeof(int);
size_t strings_buffer_offset = public_symbols_size + imports_size;
if (file->buffer + strings_buffer_offset >= file_end) {
failure("public symbols are out of the file size\n");
}
@ -89,11 +95,18 @@ Bytefile *read_file(const char *fname) {
if (file->string_ptr + file->stringtab_size > file_end) {
failure("strings table is out of the file size\n");
}
size_t substs_buffer_offset = strings_buffer_offset + file->stringtab_size;
file->substs_ptr = file->buffer + substs_buffer_offset;
if ((char *)file->substs_ptr + file->substs_area_size > file_end) {
failure("substitutions table is out of the file size\n");
}
// if (file->stringtab_size > 0 &&
// file->string_ptr[file->stringtab_size - 1] != 0) {
// failure("strings table is not zero-ended\n");
// }
file->code_size = size - strings_buffer_offset - file->stringtab_size;
file->code_size = size - substs_buffer_offset - file->substs_area_size;
if (file->code_size < 0 || public_symbols_size < 0 ||
file->stringtab_size < 0) {
@ -109,6 +122,138 @@ Bytefile *read_file(const char *fname) {
return file;
}
// Per-section totals / running offsets used while merging bytecode files.
// Every member defaults to zero so that a default-constructed value is a valid
// "nothing merged yet" state.
struct Offsets {
  size_t strings = 0; // accumulated string table size (bytes)
  size_t globals = 0; // accumulated global area size (words)
  size_t code = 0;    // accumulated code size (bytes)
};
// Relocates one module's bytecode in place before it is copied into the merged
// file:
//   * STRING operands are shifted by `offsets.strings`;
//   * JMP / CJMPnz / CJMPz / CLOSURE / CALL targets are shifted by
//     `offsets.code`.
// All other commands are left untouched.
//
// The operand position is captured *before* parse_command() moves `ip`, so the
// read-modify-write hits the operand of the command just decoded rather than
// whatever bytes follow it.
// NOTE(review): assumes a one-byte opcode immediately followed by the 32-bit
// operand to patch — confirm against parse_command's decoding.
void rewrite_code_with_offsets(Bytefile *bytefile, const Offsets &offsets) {
  char *const code_begin = bytefile->code_ptr;
  char *ip = code_begin;

  while (ip - code_begin < bytefile->code_size) {
    char *const operand = ip + 1;
    const Cmd cmd = parse_command(&ip, bytefile).first;

    size_t shift = 0;
    switch (cmd) {
    case Cmd::STRING:
      shift = offsets.strings; // TODO: check
      break;
    case Cmd::JMP:
    case Cmd::CJMPnz:
    case Cmd::CJMPz:
    case Cmd::CLOSURE:
    case Cmd::CALL:
      shift = offsets.code; // TODO: check
      break;
    default:
      continue; // nothing to relocate in this command
    }

    char *cursor = operand;
    ip_write_int_unsafe(
        operand, static_cast<int32_t>(ip_read_int_unsafe(&cursor) + shift));
  }
}
// Resolves the address substitutions of one module.
//
// The substs area is walked as a sequence of entries, each a 32-bit code
// offset followed by a NUL-terminated public symbol name. For every entry the
// name is looked up in `publics` and the found value is stored as a 32-bit
// word at `code_ptr + offset`.
//
// Reports a failure (failure() is expected not to return) when an entry is
// truncated, a name is not terminated inside the area, a name is unknown, or
// the patched word would fall outside the code segment.
void subst_in_code(Bytefile *bytefile,
                   const std::unordered_map<std::string, size_t> &publics) {
  const char *const area = bytefile->substs_ptr;
  const size_t area_size = bytefile->substs_area_size;

  for (size_t i = 0; i < area_size;) {
    // An entry needs the offset plus at least the name terminator.
    if (i + sizeof(uint32_t) >= area_size) {
      failure("substitution %zu offset is out of area", i);
    }
    // Entries follow variable-length names, so the offset may be unaligned:
    // copy it out instead of dereferencing a casted pointer.
    uint32_t offset;
    memcpy(&offset, area + i, sizeof(offset));
    i += sizeof(uint32_t);

    const char *name = area + i;
    // Bounded search: never scan past the end of the substs area.
    const void *terminator = memchr(name, '\0', area_size - i);
    if (terminator == nullptr) {
      failure("substitution %zu name is out of area", i);
    }
    // Step over the name AND its terminator so the next entry starts at its
    // offset field.
    i += static_cast<size_t>(static_cast<const char *>(terminator) - name) + 1;

    const auto it = publics.find(name);
    if (it == publics.end()) {
      failure("public name for substitution is not found: %s", name);
    }

    if (bytefile->code_size < 0 ||
        offset + sizeof(uint32_t) > static_cast<size_t>(bytefile->code_size)) {
      failure("substitution offset %u is out of code", offset);
    }
    const uint32_t target = static_cast<uint32_t>(it->second);
    memcpy(bytefile->code_ptr + offset, &target, sizeof(target));
    // TODO: check: +4 to match ?
  }
}
// Sums the section sizes of all files to be merged: string table sizes into
// `strings`, global area sizes into `globals`, and code sizes into `code`.
// Returns all zeros for an empty list.
Offsets calc_merge_sizes(const std::vector<Bytefile *> &bytefiles) {
  Offsets sizes{};
  for (const Bytefile *bf : bytefiles) {
    sizes.strings += bf->stringtab_size;
    sizes.globals += bf->global_area_size;
    sizes.code += bf->code_size;
  }
  return sizes;
}
// Merges several unpacked bytecode files into one freshly malloc'ed Bytefile.
//
// Steps:
//   1. collect every public symbol (except "main") into a name -> code offset
//      map, where the offset is already relative to the merged code segment;
//   2. lay out the result as [string tables][code], globals are not stored;
//   3. for each input: relocate its code, resolve its substitutions, copy its
//      strings and code into the result, then free() the input.
//
// Takes ownership of the inputs: every pointer in `bytefiles` is freed and the
// vector is left empty. Reports a failure on allocation error or when a public
// name (other than "main") is defined more than once.
Bytefile *merge_files(std::vector<Bytefile *> &&bytefiles) {
  const Offsets sizes = calc_merge_sizes(bytefiles);

  // globals - on stack, so only strings and code are allocated here
  Bytefile *result = static_cast<Bytefile *>(
      malloc(sizeof(Bytefile) + sizes.strings + sizes.code));
  if (result == nullptr) {
    failure("unable to allocate memory for merged bytefile\n");
  }

  // collect publics
  std::unordered_map<std::string, size_t> publics;
  // Entry points of the individual modules; gathered but not used yet (see
  // the main_offset TODO below).
  std::vector<size_t> main_offsets;
  {
    size_t code_offset = 0;
    for (Bytefile *bf : bytefiles) {
      for (size_t j = 0; j < bf->public_symbols_number; ++j) {
        const char *name = get_public_name_unsafe(bf, j);
        // The symbol's *code* offset is needed here, not the offset of its
        // name inside the string table.
        const size_t offset = get_public_offset_unsafe(bf, j) + code_offset;
        if (strcmp(name, "main") == 0) {
          main_offsets.push_back(offset);
        } else if (!publics.insert({name, offset}).second) {
          failure("public name found more then once: %s", name);
        }
      }
      code_offset += bf->code_size;
    }
  }

  // init result
  result->code_size = sizes.code;
  result->stringtab_size = sizes.strings;
  result->global_area_size = sizes.globals;
  result->substs_area_size = 0;
  result->imports_number = 0;
  result->public_symbols_number = 0;

  result->main_offset = 0; // TODO: save all main offsets in some way (?)
  result->string_ptr = result->buffer;
  result->imports_ptr = nullptr;
  result->public_ptr = nullptr;
  result->code_ptr = result->string_ptr + result->stringtab_size;
  result->global_ptr = nullptr;
  result->substs_ptr = nullptr;

  // update & merge code segments
  Offsets offsets{};
  for (Bytefile *bf : bytefiles) {
    rewrite_code_with_offsets(bf, offsets);
    subst_in_code(bf, publics);

    // copy data to merged file
    memcpy(result->string_ptr + offsets.strings, bf->string_ptr,
           bf->stringtab_size);
    memcpy(result->code_ptr + offsets.code, bf->code_ptr, bf->code_size);

    // update offsets
    offsets.strings += bf->stringtab_size;
    offsets.globals += bf->global_area_size;
    offsets.code += bf->code_size;

    free(bf);
  }
  // Every element has been freed; do not leave dangling pointers behind.
  bytefiles.clear();

  return result;
}
const char *command_name(Cmd cmd, int8_t l) {
static const char *const ops[] = {
#define OP_TO_STR(id, op) "BINOP:" #op,
@ -212,8 +357,8 @@ const char *command_name(Cmd cmd, int8_t l) {
return "FAIL";
case Cmd::LINE:
return "LINE";
case Cmd::CALLF:
return "CALLF";
// case Cmd::CALLF:
// return "CALLF";
case Cmd::PATT:
if (l >= sizeof(pats) / sizeof(char *)) {
return "_UNDEF_PATT_";
@ -455,7 +600,7 @@ std::pair<Cmd, uint8_t> parse_command_impl(char **ip, const Bytefile &bf,
case CMD_CTRL:
switch (l) {
case CMD_CTRL_CJMPz: // CJMPnz 0x%.8x
case CMD_CTRL_CJMPz: // CJMPz 0x%.8x
cmd = Cmd::CJMPz;
read_print_cmd_seq_opt<do_read_args, use_out, ArgT::OFFSET>(cmd, l, ip,
bf, out);
@ -532,11 +677,12 @@ std::pair<Cmd, uint8_t> parse_command_impl(char **ip, const Bytefile &bf,
read_print_cmd_seq_opt<do_read_args, use_out, ArgT::INT>(cmd, l, ip, bf,
out);
break;
case CMD_CTRL_CALLF: // CALLF %s %d
cmd = Cmd::CALLF;
read_print_cmd_seq_opt<do_read_args, use_out, ArgT::STR, ArgT::INT>(
cmd, l, ip, bf, out);
break;
// NOTE: is replaced
// case CMD_CTRL_CALLF: // CALLF %s %d
// cmd = Cmd::CALLF;
// read_print_cmd_seq_opt<do_read_args, use_out, ArgT::STR, ArgT::INT>(
// cmd, l, ip, bf, out);
// break;
default:
failure("invalid opcode");
@ -574,8 +720,8 @@ std::pair<Cmd, uint8_t> parse_command_impl(char **ip, const Bytefile &bf,
// case CMD_BUILTIN_Barray: // CALL Barray %d
// cmd = Cmd::Barray;
// read_print_cmd_seq_opt<do_read_args, use_out, ArgT::INT>(cmd, l, ip,
// bf,
// read_print_cmd_seq_opt<do_read_args, use_out, ArgT::INT>(cmd, l,
// ip, bf,
// out);
// break;
@ -613,6 +759,7 @@ bool is_command_name(char *ip, const Bytefile *bf, Cmd cmd) {
void print_file_info(const Bytefile &bf, std::ostream &out) {
out << "String table size : " << bf.stringtab_size << '\n';
out << "Global area size : " << bf.global_area_size << '\n';
out << "Substitutions area size : " << bf.substs_area_size << '\n';
out << "Number of imports : " << bf.imports_number << '\n';
out << "Number of public symbols: " << bf.public_symbols_number << '\n';