> The lexical grammars of Python and Haskell are not regular. What does that
> mean, and why aren’t they?

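Regular languages can be recognized by a finite-state machine (FSM), but the
lexical grammars of Python and Haskell need more context than that: both use
significant indentation, so the scanner has to remember the enclosing
indentation levels, an unbounded stack that an FSM cannot keep.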

> Aside from separating tokens—distinguishing `print foo` from
> `printfoo`—spaces aren’t used for much in most languages. However, in a
> couple of dark corners, a space does affect how code is parsed in
> CoffeeScript, Ruby, and the C preprocessor. Where and what effect does it
> have in each of those languages?

Spaces do matter for the closing of heredocs in Ruby...

> Our scanner here, like most, discards comments and whitespace since those
> aren’t needed by the parser. Why might you want to write a scanner that does
> not discard those? What would it be useful for?

Formatting or documentation generation?

FossilOrigin-Name: 5738a38b8f15d1aca61212966ff990de59a03502f183a7400bf4f2ea14ae5af6
private
alpha 2 years ago
parent 6e484507d0
commit f98866b04e

@ -1,5 +1,7 @@
#!/usr/bin/env ruby -w
require "strscan"
module Lox
class Error < StandardError
def initialize(line:, where: "", message:)
@ -32,7 +34,7 @@ module Lox
end
def self.run(src)
Runner.new(src).run
Runner.new.run(src)
end
def self.error(line, msg)
@ -40,17 +42,118 @@ module Lox
end
class Runner
def initialize(scanner:)
def initialize(scanner: Scanner.new)
@scanner = scanner
end
def run(src)
@scanner.scan(src)
@scanner.scan(src).each do |token|
puts token
end
end
end
# Turns Lox source text into a flat list of Token structs.
#
# Problems found while scanning (unexpected characters, unterminated strings)
# are collected on the per-call State#errors list; #scan itself only returns
# the tokens.
class Scanner
  # Reserved words keyed by their lexeme, mapped to the token type they emit.
  KEYWORDS = {
    and: :AND,
    class: :CLASS,
    else: :ELSE,
    false: :FALSE,
    for: :FOR,
    fun: :FUN,
    if: :IF,
    nil: :NIL,
    or: :OR,
    print: :PRINT,
    return: :RETURN,
    super: :SUPER,
    this: :THIS,
    true: :TRUE,
    var: :VAR,
    while: :WHILE,
  }.transform_keys(&:to_s).freeze

  # Bookkeeping for a single #scan call: the StringScanner cursor, the tokens
  # and errors gathered so far, and the current 1-based line number.
  State = Struct.new(:ss, :tokens, :errors, :line) do
    def eos? = ss.eos?
    def scan(re) = ss.scan(re)
    def pos = ss.pos

    # Appends a token stamped with the current line. When no text is given the
    # lexeme is whatever the most recent successful scan matched.
    def add_token(type, text: nil, literal: nil)
      text ||= ss.matched
      tokens << Token.new(type, text, literal, line)
    end
  end

  # Scans +src+ from start to end.
  #
  # @param src [String] Lox source code
  # @return [Array<Token>] the tokens in source order
  def scan(src)
    state = State.new(StringScanner.new(src), [], [], 1)

    until state.eos?
      # Order matters: two-character operators and comments must be tried
      # before the one-character tokens that are their prefixes.
      case
      when state.scan(/\(/) then state.add_token(:LEFT_PAREN)
      when state.scan(/\)/) then state.add_token(:RIGHT_PAREN)
      when state.scan(/\{/) then state.add_token(:LEFT_BRACE)
      when state.scan(/\}/) then state.add_token(:RIGHT_BRACE)
      when state.scan(/,/) then state.add_token(:COMMA)
      when state.scan(/\./) then state.add_token(:DOT)
      when state.scan(/-/) then state.add_token(:MINUS)
      when state.scan(/\+/) then state.add_token(:PLUS)
      when state.scan(/;/) then state.add_token(:SEMICOLON)
      when state.scan(/\*/) then state.add_token(:STAR)
      when state.scan(/!=/) then state.add_token(:BANG_EQUAL)
      when state.scan(/!/) then state.add_token(:BANG)
      when state.scan(/==/) then state.add_token(:EQUAL_EQUAL)
      when state.scan(/=/) then state.add_token(:EQUAL)
      when state.scan(/<=/) then state.add_token(:LESS_EQUAL)
      when state.scan(/</) then state.add_token(:LESS)
      when state.scan(/>=/) then state.add_token(:GREATER_EQUAL)
      when state.scan(/>/) then state.add_token(:GREATER)
      when state.scan(%r{//[^\n]*}) # ignore comment (runs to end of line)
      when state.scan(%r{/}) then state.add_token(:SLASH)
      when state.scan(/[ \r\t]/) # ignore whitespace
      when state.scan(/\n/) then state.line += 1
      when state.scan(/"/)
        scan_str(state)
      when number = state.scan(/\d+(\.\d+)?/)
        state.add_token(:NUMBER, literal: number.to_f)
      when identifier = state.scan(/[a-zA-Z_]\w*/)
        # `\w*` (not `\w+`) so that one-letter identifiers are accepted.
        type = KEYWORDS.fetch(identifier, :IDENTIFIER)
        state.add_token(type)
      else
        state.errors << Error.new(line: state.line, message: "Unexpected character.")
        # Skip the offending character; without this the loop never advances.
        state.ss.getch
      end
    end

    state.tokens
  end

  private

  # Scans the remainder of a string literal whose opening quote has already
  # been consumed. Strings may span lines; the resulting token carries the
  # line on which the literal ends. Reaching end of input before the closing
  # quote records an "Unterminated string." error and emits no token.
  def scan_str(state)
    body = state.scan(/[^"]*/) # always matches, possibly the empty string
    state.line += body.count("\n")

    if state.scan(/"/)
      state.add_token(:STRING, text: "\"#{body}\"", literal: body)
    else
      state.errors << Error.new(line: state.line, message: "Unterminated string.")
    end
  end
end
# A single lexical token: its type symbol, the exact source text (lexeme),
# an optional literal value, and the line number it was stamped with.
Token = Struct.new(:type, :lexeme, :literal, :line) do
  # Renders the token as "TYPE lexeme literal", separated by single spaces.
  # A nil literal renders as an empty string, leaving a trailing space.
  def to_s = format("%s %s %s", type, lexeme, literal)
end
end

@ -31,3 +31,72 @@ class TestRunner < Minitest::Test
assert_equal %w[ some tokens ], tokens
end
end
# Unit tests for Lox::Scanner: punctuation and operators, skipped trivia
# (comments and whitespace), line tracking, and string, number and
# identifier/keyword literals.
class TestScanner < Minitest::Test
  def setup
    @scanner = Lox::Scanner.new
  end

  # Every punctuation/operator lexeme scans to exactly one token of its type.
  def test_basic_tokens
    expected_types = {
      "(" => :LEFT_PAREN,
      ")" => :RIGHT_PAREN,
      "{" => :LEFT_BRACE,
      "}" => :RIGHT_BRACE,
      "," => :COMMA,
      "." => :DOT,
      "-" => :MINUS,
      "+" => :PLUS,
      ";" => :SEMICOLON,
      "*" => :STAR,
      "!=" => :BANG_EQUAL,
      "!" => :BANG,
      "==" => :EQUAL_EQUAL,
      "=" => :EQUAL,
      "<=" => :LESS_EQUAL,
      "<" => :LESS,
      ">=" => :GREATER_EQUAL,
      ">" => :GREATER,
      "/" => :SLASH,
    }

    expected_types.each do |lexeme, type|
      assert_equal [type], @scanner.scan(lexeme).map(&:type)
    end
  end

  # Tabs, spaces, newlines and `//` comments produce no tokens.
  def test_comments_and_whitespace
    source = "(\t) // here lies a comment\n.\n"

    assert_equal %i[LEFT_PAREN RIGHT_PAREN DOT], @scanner.scan(source).map(&:type)
  end

  # Each token records the line it was scanned on.
  def test_line_numbers
    source = "(\n)\n"

    assert_equal [1, 2], @scanner.scan(source).map(&:line)
  end

  def test_strings
    assert_equal [string_token('""', "", 1)], @scanner.scan('""')
    assert_equal [], @scanner.scan('"') # TODO test the error once it's exposed
    assert_equal [string_token('"foo"', "foo", 1)], @scanner.scan('"foo"')
    assert_equal [string_token("\"foo\nbar\"", "foo\nbar", 2)], @scanner.scan("\"foo\nbar\"")
  end

  def test_numbers
    expected = [
      Lox::Token.new(:NUMBER, "123", 123.0, 1),
      Lox::Token.new(:NUMBER, "123.4", 123.4, 1),
    ]

    assert_equal expected, @scanner.scan("123 123.4")
  end

  # A keyword is only recognised as a whole word, not as an identifier prefix.
  def test_identifiers
    expected = [
      Lox::Token.new(:OR, "or", nil, 1),
      Lox::Token.new(:IDENTIFIER, "orchid", nil, 1),
    ]

    assert_equal expected, @scanner.scan("or orchid")
  end

  private

  # Builds the STRING token expected for the given lexeme, value and line.
  def string_token(lexeme, value, line)
    Lox::Token.new(:STRING, lexeme, value, line)
  end
end

Loading…
Cancel
Save