Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,13 @@
- `append`, `concat`, and `pop` can be used as values
- new `ptr` command for the debugger, printing the VM pointers (ip, pp, sp)
- compile time arity check when performing a tail call
- `string:utf8len` to compute the number of codepoints in a string

### Changed
- all paths inside `if` should return a value, when used as an expression. If an `else` branch is missing, `nil` will be returned
- new compile time error when trying to use `append!`, `concat!`, `pop!`, `@=` and `@@=` as values
- arguments in tail calls are loaded by value and not by reference
- `string:ord` checks that it gets only 1 utf8 character

### Removed

Expand Down
1 change: 1 addition & 0 deletions include/Ark/Builtins/Builtins.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ namespace Ark::internal::Builtins
ARK_BUILTIN(format);
ARK_BUILTIN(findSubStr);
ARK_BUILTIN(removeAtStr);
ARK_BUILTIN(utf8len);
ARK_BUILTIN(ord);
ARK_BUILTIN(chr);
ARK_BUILTIN(setStringAt);
Expand Down
81 changes: 52 additions & 29 deletions include/utf8.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ namespace utf8
/**
* @brief Check the validity of a given string in UTF8
* @param str
* @return true if the given string is a valid UTF88 string
* @return true if the given string is a valid UTF8 string
*/
inline bool isValid(const char* str)
{
Expand Down Expand Up @@ -183,46 +183,69 @@ namespace utf8
return true;
}

/**
 * @brief Count the number of UTF8 codepoints in a NUL-terminated string
 * @details The width of each codepoint is derived from its lead byte only.
 *          Continuation bytes are checked for presence (not being the
 *          terminator), but are not validated as 10xxxxxx bytes.
 *          Counting stops at the first invalid lead byte or at a multi-byte
 *          sequence truncated by the end of the string; the incomplete
 *          sequence is not counted.
 * @param str NUL-terminated string, may be nullptr
 * @return number of complete codepoints read before the end of the string or
 *         the first malformed sequence; 0 if str is nullptr
 */
inline std::size_t length(const char* str)
{
    if (str == nullptr)
        return 0;

    std::size_t count = 0;
    const char* s = str;

    while (*s != 0)
    {
        // number of bytes announced by the lead byte
        std::size_t width = 0;
        if (0xf0 == (0xf8 & *s))
            width = 4;
        else if (0xe0 == (0xf0 & *s))
            width = 3;
        else if (0xc0 == (0xe0 & *s))
            width = 2;
        else if (0x00 == (0x80 & *s))
            width = 1;
        else
            break;

        // Make sure the terminator is not inside the announced sequence,
        // otherwise advancing by `width` would step past the end of the string.
        bool truncated = false;
        for (std::size_t i = 1; i < width; ++i)
        {
            if (s[i] == 0)
            {
                truncated = true;
                break;
            }
        }
        if (truncated)
            break;

        ++count;
        s += width;
    }

    return count;
}

/**
* @brief Compute the UTF8 codepoint for a given UTF8 char
* @param str
* @return UTF8 codepoint if valid, -1 otherwise
*/
inline int32_t codepoint(const char* str)
{
int32_t codepoint = 0;
const char* s = str;

if (isValid(str))
{
while (*s != 0)
{
if (0xf0 == (0xf8 & *s))
{
codepoint = ((0x07 & s[0]) << 18) | ((0x3f & s[1]) << 12) | ((0x3f & s[2]) << 6) | (0x3f & s[3]);
s += 4;
}
else if (0xe0 == (0xf0 & *s))
{
codepoint = ((0x0f & s[0]) << 12) | ((0x3f & s[1]) << 6) | (0x3f & s[2]);
s += 3;
}
else if (0xc0 == (0xe0 & *s))
{
codepoint = ((0x1f & s[0]) << 6) | (0x3f & s[1]);
s += 2;
}
else if (0x00 == (0x80 & *s))
{
codepoint = s[0];
++s;
}
else
return -1;
}
}
int32_t c = 0;

return codepoint;
if (0xf0 == (0xf8 & *s))
c = ((0x07 & s[0]) << 18) | ((0x3f & s[1]) << 12) | ((0x3f & s[2]) << 6) | (0x3f & s[3]);
else if (0xe0 == (0xf0 & *s))
c = ((0x0f & s[0]) << 12) | ((0x3f & s[1]) << 6) | (0x3f & s[2]);
else if (0xc0 == (0xe0 & *s))
c = ((0x1f & s[0]) << 6) | (0x3f & s[1]);
else if (0x00 == (0x80 & *s))
c = s[0];
else
return -1;
return c;
}
return -1;
}

/**
Expand Down
2 changes: 1 addition & 1 deletion lib/std
Submodule std updated 2 files
+9 −0 String.ark
+8 −0 tests/string-tests.ark
1 change: 1 addition & 0 deletions src/arkreactor/Builtins/Builtins.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ namespace Ark::internal::Builtins
{ "format", Value(String::format) },
{ "builtin__string:find", Value(String::findSubStr) },
{ "builtin__string:removeAt", Value(String::removeAtStr) },
{ "builtin__string:utf8len", Value(String::utf8len) },
{ "builtin__string:ord", Value(String::ord) },
{ "builtin__string:chr", Value(String::chr) },
{ "builtin__string:setAt", Value(String::setStringAt) },
Expand Down
20 changes: 19 additions & 1 deletion src/arkreactor/Builtins/String.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,18 @@ namespace Ark::internal::Builtins::String
throw std::runtime_error(fmt::format("string:removeAt: index {} out of range (length: {})", num, n[0].stringRef().size()));
}

/**
 * @brief Builtin computing the number of UTF8 codepoints in a string
 * @param n arguments of the call, expected to hold a single String
 * @param vm unused
 * @return the count returned by utf8::length, as a Number (double)
 * @throws types::TypeCheckingError if the arguments are not a single String
 */
Value utf8len(std::vector<Value>& n, VM* vm [[maybe_unused]])
{
    if (types::check(n, ValueType::String))
    {
        // count on the raw NUL-terminated buffer of the first argument
        const char* raw = n[0].stringRef().c_str();
        const std::size_t codepoints = utf8::length(raw);
        return Value(static_cast<double>(codepoints));
    }

    throw types::TypeCheckingError(
        "string:utf8len",
        { { types::Contract { { types::Typedef("string", ValueType::String) } } } },
        n);
}

Value ord(std::vector<Value>& n, VM* vm [[maybe_unused]])
{
if (!types::check(n, ValueType::String))
Expand All @@ -296,7 +308,13 @@ namespace Ark::internal::Builtins::String
{ { types::Contract { { types::Typedef("string", ValueType::String) } } } },
n);

return Value(utf8::codepoint(n[0].stringRef().c_str()));
if (const std::size_t len = utf8::length(n[0].stringRef().c_str()); len != 1)
throw std::runtime_error(fmt::format("string:ord: invalid string '{}', expected a single character, got {}", n[0].string(), len));

const int32_t codepoint = utf8::codepoint(n[0].stringRef().c_str());
if (codepoint == -1)
throw std::runtime_error(fmt::format("string:ord: invalid string '{}'", n[0].string()));
return Value(codepoint);
}

// cppcheck-suppress constParameterReference
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
page_0
PUSH_RETURN_ADDRESS L0
BUILTIN 69
BUILTIN 70
BUILTIN 71
BUILTIN 72
Expand All @@ -25,6 +24,7 @@ page_0
BUILTIN 91
BUILTIN 92
BUILTIN 93
BUILTIN 94
CALL_BUILTIN 9, 25
.L0:
POP 0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ page_0
HALT 0

page_1
CALL_BUILTIN_WITHOUT_RETURN_ADDRESS 55, 1
CALL_BUILTIN_WITHOUT_RETURN_ADDRESS 56, 1
.L0:
RET 0
HALT 0
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
(print (builtin__string:ord ""))
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
string:ord: invalid string '', expected a single character, got 0

In file tests/unittests/resources/DiagnosticsSuite/runtime/string_ord_empty_str.ark:1
1 | (print (builtin__string:ord ""))
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2 |
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
(print (builtin__string:ord "abc"))
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
string:ord: invalid string 'abc', expected a single character, got 3

In file tests/unittests/resources/DiagnosticsSuite/runtime/string_ord_str_too_long.ark:1
1 | (print (builtin__string:ord "abc"))
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2 |
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
(print (builtin__string:utf8len 1 2))
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Function string:utf8len expected 1 argument but got 2
Call
↳ (string:utf8len 1 2)
Signature
↳ (string:utf8len string)
Arguments
→ `string' (expected String), got 1 (Number)
→ unexpected additional args: 2 (Number)

In file tests/unittests/resources/DiagnosticsSuite/typeChecking/utf8len_num_num.ark:1
1 | (print (builtin__string:utf8len 1 2))
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2 |
Loading