diff --git a/doc/spec/01.01.general_characteristic.tex b/doc/spec/01.01.general_characteristics.tex similarity index 100% rename from doc/spec/01.01.general_characteristic.tex rename to doc/spec/01.01.general_characteristics.tex diff --git a/doc/spec/03.01.lexical_structure.tex b/doc/spec/03.01.lexical_structure.tex new file mode 100644 index 000000000..11f8fa115 --- /dev/null +++ b/doc/spec/03.01.lexical_structure.tex @@ -0,0 +1,110 @@ +\section{Lexical Structure} + +The character set for the language is \textsc{ASCII}, case-sensitive. In the following lexical description we will use +the GNU Regexp syntax~\cite{GNULib} in lexical definitions. + +\subsection{Whitespaces and Comments} + +Whitespaces and comments are \textsc{ASCII} sequences which serve as delimiters for other tokens but otherwise are +ignored. + +The following characters are treated as whitespaces: + +\begin{itemize} +\item blank character "\texttt{ }"; +\item newline character "\texttt{\textbackslash n}"; +\item tabulation character "\texttt{\textbackslash t}". +\end{itemize} + +Additionally, two kinds of comments are recognized: + +\begin{itemize} +\item end-of-line comment "\texttt{--}" escapes the rest of the line, including itself; +\item block comment "\texttt{(*} ... \texttt{*)}" escapes all the text between + "\texttt{(*}" and "\texttt{*)}". +\end{itemize} + +There are a number of specific cases which have to be considered explicitly. + +First, block comments can be properly nested. Then, the occurrences of comment symbols inside string literals (see below) are not +considered as comments. + +End-of-line comment encountered \emph{outside} of a block comment escapes block comment symbols: + +\begin{lstlisting} + -- the following symbols are not considered as a block comment: (* + -- same here: *) +\end{lstlisting} + +Similarly, an end-of-line comment encountered inside a block comment is escaped: + +\begin{lstlisting} + (* Block comment starts here ... 
+ -- and ends here: *) +\end{lstlisting} + +\subsection{Identifiers and Constants} + +The language distinguishes identifiers, signed decimal literals, string and character literals (see Fig.~\ref{idents_and_consts}). There are +two kinds of identifiers: those beginning with uppercase characters (\token{UIDENT}) and lowercase characters (\token{LIDENT}). + +String literals cannot span multiple lines; a double quote character (") inside a string literal has to be doubled to prevent it from +being considered as this literal's delimiter. + +Character literals as a rule are comprised of a single \textsc{ASCII} character; if this character is a quote (') it has to be doubled. Additionally +two-character abbreviations "\textbackslash t" and "\textbackslash n" are recognized and converted into a single-character representation. + +\begin{figure}[t] + \[ + \begin{array}{rcl} + \token{UIDENT} & = &\mbox{\texttt{[A-Z][a-zA-Z\_0-9]*}}\\ + \token{LIDENT} & = &\mbox{\texttt{[a-z][a-zA-Z\_0-9]*}}\\ + \token{DECIMAL}& = &\mbox{\texttt{-?[0-9]+}}\\ + \token{STRING} & = &\mbox{\texttt{"([\^{}\textbackslash"]|"")*"}}\\ + \token{CHAR} & = &\mbox{\texttt{'([\^{}']|''|\textbackslash n|\textbackslash t)'}} + \end{array} + \] + \caption{Identifiers and constants} + \label{idents_and_consts} +\end{figure} + + +\subsection{Keywords} + +The following identifiers are reserved for keywords: + +\begin{lstlisting} + after array at before boxed case do elif else + esac false fi for fun if import infix infixl + infixr length local od of public repeat return sexp + skip string then true unboxed until when while +\end{lstlisting} + +\subsection{Infix Operators} + +Infix operators are defined as follows: + +\[ +\token{INFIX}=\mbox{\texttt{[+*/\%\$\#@!|\&\^{}~?<>:=\textbackslash-]+}} +\] + +There is a predefined set of built-in infix operators (see~\ref{binary_expressions}); additionally +an end-user can define custom infix operators (see~\ref{custom_infix}). 
Note, sometimes +additional whitespaces are required to disambiguate infix operator applications. For example, if a +custom infix operator "\lstinline|+-|" is defined, then the expression "\lstinline|a +- b|" can no longer be +considered as "\lstinline|a +(-b)|". Note also that a custom operator "\lstinline|--|" can not be +defined due to lexical conventions. + +\subsection{Delimiters} + +The following symbols are treated as delimiters: + +\begin{lstlisting} + . , ( ) { } + ; # -> +\end{lstlisting} + +Although custom infix operators can coincide with delimiters "\lstinline|#|" and "\lstinline|->|" they can +never clash as both these delimiters can not be encountered in expressions. + + diff --git a/doc/spec/03.02.compilation_units.tex b/doc/spec/03.02.compilation_units.tex new file mode 100644 index 000000000..b9bde8b68 --- /dev/null +++ b/doc/spec/03.02.compilation_units.tex @@ -0,0 +1,21 @@ +\begin{figure}[t] + \[ + \begin{array}{rcl} + \defterm{compilationUnit} & : & \nonterm{import}^\star\s\nonterm{scopeExpression}\\ + \defterm{import} & : & \term{import}\s\token{UIDENT}\s\term{;} + \end{array} + \] + \caption{Compilation unit concrete syntax} + \label{compilation_unit} +\end{figure} + +\section{Compilation Units} + +A compilation unit is a minimal structure recognized by a parser. An application can contain multiple units, compiled separately. +In order to use other units they have to be imported. In particular, the standard library is comprised of a number of precompiled units, +which can be imported by an end-user application. + +The concrete syntax for a compilation unit is shown in Fig.~\ref{compilation_unit}. Besides optional imports a unit must contain +a \nonterm{scopeExpression}, which may contain some definitions and computations. Note, a unit can not be empty. The computations described in +a unit are performed at unit initialization time (see~\ref{separate_compilation}). 
+ diff --git a/doc/spec/03.03.scope_expressions.tex b/doc/spec/03.03.scope_expressions.tex new file mode 100644 index 000000000..16d17d924 --- /dev/null +++ b/doc/spec/03.03.scope_expressions.tex @@ -0,0 +1,105 @@ +\begin{figure}[t] + \[ + \begin{array}{rcl} + \defterm{scopeExpression} & : & \nonterm{definition}^\star\s\nonterm{expression}\\ + \defterm{definition} & : & \nonterm{variableDefinition}\alt\nonterm{functionDefinition}\alt\nonterm{infixDefinition}\\ + \defterm{variableDefinition} & : & (\s\term{local}\alt\term{public}\s)\s\nonterm{variableDefinitionSequence}\s\term{;}\\ + \defterm{variableDefinitionSequence} & : & \nonterm{variableDefinitionSequenceItem}\s(\s\term{,}\s\nonterm{variableDefinitionSequenceItem}\s)^\star\\ + \defterm{variableDefinitionSequenceItem} & : & \token{LIDENT}\s[\s\term{=}\s\nonterm{basicExpression}\s]\\ + \defterm{functionDefinition} & : & [\s\term{public}\s]\s\term{fun}\s\token{LIDENT}\s\term{(}\s\nonterm{functionArguments}\s\term{)}\s\nonterm{functionBody}\\ + \defterm{functionArguments} & : & [\s\token{LIDENT}\s(\s\term{,}\s\token{LIDENT}\s)^\star\s]\\ + \defterm{functionBody} & : & \term{\{}\s\nonterm{scopeExpression}\s\term{\}} + \end{array} + \] + \caption{Scope expression concrete syntax} + \label{scope_expression} +\end{figure} + +\section{Scope Expressions} + +Scope expressions provide a means to put expressions in a scoped context. The definitions in scoped expressions consist of function definitions and +variable definitions (see Fig.~\ref{scope_expression}). 
For example: + +\begin{lstlisting} + local x, y, z; -- variable definitions + + fun id (x) {x} -- function definition +\end{lstlisting} + +As scope expressions are expressions, they can be nested: + +\begin{lstlisting} + local x; + + { -- nested scope begins here + local y; + skip + } -- nested scope ends here +\end{lstlisting} + +The definitions at the top level of a compilation unit can be tagged as ``\lstinline|public|'', in which case they are exported and become visible to +other units which import the given one. Nested scopes can not contain public definitions. + +The nesting relation has the shape of a tree, and in a concrete node of the tree all definitions in all enclosing scopes are visible: + +\begin{lstlisting} + local x; + + {local y; + {local z; + skip -- x, y, and z are visible here + }; + {local t; + skip -- x, y, and t are visible here + }; + skip -- x and y are visible here + }; + skip -- only x is visible here +\end{lstlisting} + +Multiple definitions of the same name in the same scope are prohibited: + +\begin{lstlisting} + local x; + fun x () {0} -- error +\end{lstlisting} + +However, a definition in a nested scope can override a definition in an enclosing one: + +\begin{lstlisting} + local x; + + { + fun x () {0} -- ok + skip -- here x is associated with the function + }; + + skip -- here x is associated with the variable +\end{lstlisting} + +A function can freely use all visible definitions; in particular, functions defined in the +same scope can be mutually recursive: + +\begin{lstlisting} + local x; + fun f () {0} + + { + fun g () {f () + h () + y} -- ok + fun h () {g () + x} -- ok + local y; + skip + }; + skip +\end{lstlisting} + +A variable, defined in a scope, can be attributed with an expression, calculating its initial value. +These expressions, however, are evaluated in the order of variable declaration. Thus, while +technically it is possible to have forward references in the initialization expression, their +behaviour is undefined. 
For example: + +\begin{lstlisting} + local x = y + 2; -- undefined, as y is not yet initialized at this point + local y = x + 2; + skip +\end{lstlisting} diff --git a/doc/spec/03.04.expressions.tex b/doc/spec/03.04.expressions.tex new file mode 100644 index 000000000..37bb295be --- /dev/null +++ b/doc/spec/03.04.expressions.tex @@ -0,0 +1,45 @@ +\begin{figure}[t] + \[ + \begin{array}{rcll} + \defterm{expression} & : & \nonterm{basicExpression}\s(\s\term{;}\s\nonterm{expression}\s)&\\ + \defterm{basicExpression} & : & \nonterm{binaryExpression}&\\ + \defterm{binaryExpression} & : & \nonterm{binaryOperand}\s\token{INFIX}\s\nonterm{binaryOperand}&\alt\\ + & & \nonterm{binaryOperand}&\\ + \defterm{binaryOperand} & : & \nonterm{binaryExpression}&\alt\\ + & & [\s\term{-}\s]\s\nonterm{postfixExpression}&\\ + \defterm{postfixExpression} & : & \nonterm{primary}&\alt\\ + & & \nonterm{postfixExpression}\s\term{(}\s[\s\nonterm{expression}\s(\s\term{,}\s\nonterm{expression}\s)^\star\s]\s\term{)}&\alt\\ + & & \nonterm{postfixExpression}\s\term{[}\s\nonterm{expression}\s\term{]}&\alt\\ + & & \nonterm{postfixExpression}\s\term{.}\s\term{length}&\alt\\ + & & \nonterm{postfixExpression}\s\term{.}\s\term{string}&\\ + \defterm{primary} & : & \token{DECIMAL}&\alt\\ + & & \token{STRING}&\alt\\ + & & \token{CHAR}&\alt\\ + & & \token{LIDENT}&\alt\\ + & & \term{true}&\alt\\ + & & \term{false}&\alt\\ + & & \term{infix}\s\token{INFIX}&\alt\\ + & & \term{skip}&\alt\\ + & & \term{return}\s[\s\nonterm{basicExpression}\s]&\alt\\ + & & \term{fun}\s\term{(}\s\nonterm{functionArguments}\s\term{)}\s\nonterm{functionBody}&\alt\\ + & & \term{\{}\s\nonterm{scopeExpression}\s\term{\}}&\alt\\ + & & \nonterm{listExpression}&\alt\\ + & & \nonterm{arrayExpression}&\alt\\ + & & \nonterm{S-expression}&\alt\\ + & & \nonterm{ifExpression}&\alt\\ + & & \nonterm{whileExpression}&\alt\\ + & & \nonterm{repeatExpression}&\alt\\ + & & \nonterm{forExpression}&\alt\\ + & & \nonterm{caseExpression}&\alt\\ + & & 
\term{(}\s\nonterm{expression}\s\term{)}& + \end{array} + \] + \caption{Expression concrete syntax} + \label{expressions} +\end{figure} + +\section{Expressions} +\label{sec:expressions} + +Expressions + diff --git a/doc/spec/03.concrete_syntax.tex b/doc/spec/03.concrete_syntax.tex index 338389e62..537bba2e6 100644 --- a/doc/spec/03.concrete_syntax.tex +++ b/doc/spec/03.concrete_syntax.tex @@ -1,71 +1,26 @@ \chapter{Concrete Syntax} -\begin{figure}[t] - \[ - \begin{array}{rcl} - \defterm{compilationUnit} & : & \nonterm{import}^\star\s\nonterm{scopeExpression}\\ - \defterm{import} & : & \term{import}\s\token{UIDENT}\s\term{;} - \end{array} - \] - \caption{Compilation unit concrete syntax} -\end{figure} +In this chapter we describe the concrete syntax of the language as it is recognized by the parser. In the +syntactic description we will use extended Backus-Naur form with the following conventions: -\begin{figure}[t] - \[ - \begin{array}{rcll} - \defterm{expression} & : & \nonterm{basicExpression}\s(\s\term{;}\s\nonterm{expression}\s)&\\ - \defterm{basicExpression} & : & \nonterm{binaryExpression}&\\ - \defterm{binaryExpression} & : & \nonterm{binaryOperand}\s\token{INFIX}\s\nonterm{binaryOperand}&\alt\\ - & & \nonterm{binaryOperand}&\\ - \defterm{binaryOperand} & : & \nonterm{binaryExpression}&\alt\\ - & & [\s\term{-}\s]\s\nonterm{postfixExpression}&\\ - \defterm{postfixExpression} & : & \nonterm{primary}&\alt\\ - & & \nonterm{postfixExpression}\s\term{(}\s[\s\nonterm{expression}\s(\s\term{,}\s\nonterm{expression}\s)^\star\s]\s\term{)}&\alt\\ - & & \nonterm{postfixExpression}\s\term{[}\s\nonterm{expression}\s\term{]}&\alt\\ - & & \nonterm{postfixExpression}\s\term{.}\s\term{length}&\alt\\ - & & \nonterm{postfixExpression}\s\term{.}\s\term{string}&\\ +\begin{itemize} +\item nonterminals are presented in \nonterm{italics}; +\item concrete terminals are \term{grayed out}; +\item classes of terminals are \token{CAPITALIZED}; +\item a postfix ``$^\star$'' designates 
zero-or-more repetitions; +\item square brackets ``$[\dots]$'' designate zero-or-one repetition; +\item round brackets ``$(\dots)$'' are used for grouping; +\item alternation is denoted by ``$\alt$'', sequencing by juxtaposition; +\item a colon ``$:$'' separates a nonterminal being defined from its definition. +\end{itemize} - \defterm{primary} & : & \token{DECIMAL}&\alt\\ - & & \token{STRING}&\alt\\ - & & \token{CHAR}&\alt\\ - & & \token{LIDENT}&\alt\\ - & & \term{true}&\alt\\ - & & \term{false}&\alt\\ - & & \term{infix}\s\token{INFIX}&\alt\\ - & & \term{skip}&\alt\\ - & & \term{fun}\s\term{(}\s\nonterm{functionArguments}\s\term{)}\s\nonterm{functionBody}&\alt\\ - & & \term{\{}\s\nonterm{scopeExpression}\s\term{\}}&\alt\\ - & & \nonterm{listExpression}&\alt\\ - & & \nonterm{arrayExpression}&\alt\\ - & & \nonterm{S-expression}&\alt\\ - & & \nonterm{ifExpression}&\alt\\ - & & \nonterm{whileExpression}&\alt\\ - & & \nonterm{repeatExpression}&\alt\\ - & & \nonterm{forExpression}&\alt\\ - & & \nonterm{caseExpression}&\alt\\ - & & \term{(}\s\nonterm{expression}\s\term{)}& - \end{array} - \] - \caption{Expression concrete syntax} -\end{figure} - - -\begin{figure}[t] - \[ - \begin{array}{rcl} - \defterm{scopeExpression} & : & \nonterm{definition}^\star\s\nonterm{expression}\\ - \defterm{definition} & : & \nonterm{variableDefinition}\alt\nonterm{functionDefinition}\alt\nonterm{infixDefinition}\\ - \defterm{variableDefinition} & : & (\s\term{local}\alt\term{public}\s)\s\nonterm{variableDefinitionSequence}\s\term{;}\\ - \defterm{variableDefinitionSequence} & : & \nonterm{variableDefinitionSequenceItem}\s(\s\term{,}\s\nonterm{variableDefinitionSequenceItem}\s)^\star\\ - \defterm{variableDefinitionSequenceItem} & : & \token{LIDENT}\s[\s\term{=}\s\nonterm{basicExpression}\s]\\ - \defterm{functionDefinition} & : & [\s\term{public}\s]\s\term{fun}\s\token{LIDENT}\s\term{(}\s\nonterm{functionArguments}\s\term{)}\s\nonterm{functionBody}\\ - \defterm{functionArguments} & : & 
[\s\token{LIDENT}\s(\s\term{,}\s\token{LIDENT}\s)^\star\s]\\ - \defterm{functionBody} & : & \term{\{}\s\nonterm{scopeExpression}\s\term{\}} - \end{array} - \] - \caption{Scope expression concrete syntax} -\end{figure} +In the description below we will enclose in-line code samples in double quotes "..." which are not considered as a +part of the concrete syntax. +\input{03.01.lexical_structure} +\input{03.02.compilation_units} +\input{03.03.scope_expressions} +\input{03.04.expressions} \begin{figure}[t] \[ diff --git a/doc/spec/spec.bib b/doc/spec/spec.bib new file mode 100644 index 000000000..6bf414b15 --- /dev/null +++ b/doc/spec/spec.bib @@ -0,0 +1,6 @@ +@manual{GNULib, + title = "{The GNU Portability Library}", + organization = "{Free Software Foundation}", + bibdate = "February 24, 2019", + bibsource = "https://www.gnu.org/software/gnulib/manual" +} diff --git a/doc/spec/spec.tex b/doc/spec/spec.tex index 299e3f6ca..7a5d4a70f 100644 --- a/doc/spec/spec.tex +++ b/doc/spec/spec.tex @@ -86,30 +86,32 @@ \newcommand{\alt}{\s\mid\s} \newcommand{\s}{\:\:} -\lstdefinelanguage{lama}{ -keywords={fun, case, esac, do, od, if, then, else, elif, fi, skip, repeat, until, for, local}, +\lstdefinelanguage{alm}{ +keywords={skip,if,then,else,elif,fi,while,do,od,repeat,until,for,fun,local,public,return,import,length, +string,case,of,esac,when,boxed,unboxed,string,sexp,array,infix,infixl,infixr,at,before,after,true,false}, sensitive=true, -%basicstyle=\small, -commentstyle=\scriptsize\rmfamily, +basicstyle=\small, +%commentstyle=\scriptsize\rmfamily, keywordstyle=\ttfamily\bfseries, identifierstyle=\ttfamily, basewidth={0.5em,0.5em}, columns=fixed, fontadjust=true, literate={->}{{$\to$}}3, -morecomment=[s]{(*}{*)} +morecomment=[s][\ttfamily]{(*}{*)}, +morecomment=[l][\ttfamily]{--} } \lstset{ mathescape=true, -%basicstyle=\small, +basicstyle=\small, identifierstyle=\ttfamily, keywordstyle=\bfseries, commentstyle=\scriptsize\rmfamily, basewidth={0.5em,0.5em}, fontadjust=true, 
escapechar=!, -language=lama +language=alm } \sloppy