#if defined(_MSC_VER) #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING #endif #include "ggml.h" #include "gguf.h" #include "wcommon.h" #include "wlog.h" #include "llama.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include // // String utils // std::string string_format(const char * fmt, ...) { va_list ap; va_list ap2; va_start(ap, fmt); va_copy(ap2, ap); int size = vsnprintf(NULL, 0, fmt, ap); GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT std::vector buf(size + 1); int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); GGML_ASSERT(size2 == size); va_end(ap2); va_end(ap); return std::string(buf.data(), size); } std::string string_strip(const std::string & str) { size_t start = 0; size_t end = str.size(); while (start < end && std::isspace(str[start])) { start++; } while (end > start && std::isspace(str[end - 1])) { end--; } return str.substr(start, end - start); } std::string string_get_sortable_timestamp() { using clock = std::chrono::system_clock; const clock::time_point current_time = clock::now(); const time_t as_time_t = clock::to_time_t(current_time); char timestamp_no_ns[100]; std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t)); const int64_t ns = std::chrono::duration_cast( current_time.time_since_epoch() % 1000000000).count(); char timestamp_ns[11]; snprintf(timestamp_ns, 11, "%09" PRId64, ns); return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns); } void string_replace_all(std::string & s, const std::string & search, const std::string & replace) { if (search.empty()) { return; } std::string builder; builder.reserve(s.length()); size_t pos = 0; size_t last_pos = 0; while ((pos = s.find(search, last_pos)) != std::string::npos) { builder.append(s, last_pos, pos - last_pos); builder.append(replace); last_pos = pos + search.length(); } builder.append(s, last_pos, std::string::npos); s = std::move(builder); } std::string string_join(const std::vector & values, const std::string & separator) { std::ostringstream result; for (size_t i = 0; i < values.size(); ++i) { if (i > 0) { result << separator; } result << values[i]; } return result.str(); } std::vector string_split(const std::string & str, const std::string & delimiter) { std::vector parts; size_t start = 0; size_t end = str.find(delimiter); while (end != std::string::npos) { parts.push_back(str.substr(start, end - start)); start = end + delimiter.length(); end = str.find(delimiter, start); } parts.push_back(str.substr(start)); return parts; } std::string string_repeat(const std::string & str, size_t n) { if (n == 0) { return ""; } std::string result; result.reserve(str.length() * n); for (size_t i = 0; i < n; ++i) { result += str; } return result; } std::string string_from(bool value) { return value ? "true" : "false"; } std::string string_from(const std::vector & values) { std::stringstream buf; buf << "[ "; bool first = true; for (auto e : values) { if (first) { first = false; } else { buf << ", "; } buf << std::to_string(e); } buf << " ]"; return buf.str(); } std::string string_from(const struct llama_context * ctx, const std::vector & tokens) { std::stringstream buf; buf << "[ "; bool first = true; for (const auto & token : tokens) { if (!first) { buf << ", "; } else { first = false; } auto detokenized = wcommon_token_to_piece(ctx, token); detokenized.erase( std::remove_if( detokenized.begin(), detokenized.end(), [](const unsigned char c) { return !std::isprint(c); }), detokenized.end()); buf << "'" << detokenized << "'" << ":" << std::to_string(token); } buf << " ]"; return buf.str(); } std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) { std::stringstream buf; buf << "[ "; bool first = true; for (int i = 0; i < batch.n_tokens; ++i) { if (!first) { buf << ", "; } else { first = false; } auto detokenized = wcommon_token_to_piece(ctx, batch.token[i]); detokenized.erase( std::remove_if( detokenized.begin(), detokenized.end(), [](const unsigned char c) { return !std::isprint(c); }), detokenized.end()); buf << "\n" << std::to_string(i) << ", token '" << detokenized << "'" << ", pos " << std::to_string(batch.pos[i]) << ", n_seq_id " << std::to_string(batch.n_seq_id[i]) << ", seq_id " << std::to_string(batch.seq_id[i][0]) << ", logits " << std::to_string(batch.logits[i]); } buf << " ]"; return buf.str(); } void string_process_escapes(std::string & input) { std::size_t input_len = input.length(); std::size_t output_idx = 0; for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) { if (input[input_idx] == '\\' && input_idx + 1 < input_len) { switch (input[++input_idx]) { case 'n': input[output_idx++] = '\n'; break; case 'r': input[output_idx++] = '\r'; break; case 't': input[output_idx++] = '\t'; break; case '\'': input[output_idx++] = '\''; break; case '\"': input[output_idx++] = '\"'; break; case '\\': input[output_idx++] = '\\'; break; case 'x': // Handle \x12, etc if (input_idx + 2 < input_len) { const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 }; char *err_p = nullptr; const long val = std::strtol(x, &err_p, 16); if (err_p == x + 2) { input_idx += 2; input[output_idx++] = char(val); break; } } // fall through default: input[output_idx++] = '\\'; input[output_idx++] = input[input_idx]; break; } } else { input[output_idx++] = input[input_idx]; } } input.resize(output_idx); } bool string_parse_kv_override(const char * data, std::vector & overrides) { const char * sep = strchr(data, '='); if (sep == nullptr || sep - data >= 128) { LOG_ERR("%s: malformed KV override '%s'\n", __func__, data); return false; } llama_model_kv_override kvo; std::strncpy(kvo.key, data, sep - data); kvo.key[sep - data] = 0; sep++; if (strncmp(sep, "int:", 4) == 0) { sep += 4; kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; kvo.val_i64 = std::atol(sep); } else if (strncmp(sep, "float:", 6) == 0) { sep += 6; kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; kvo.val_f64 = std::atof(sep); } else if (strncmp(sep, "bool:", 5) == 0) { sep += 5; kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; if (std::strcmp(sep, "true") == 0) { kvo.val_bool = true; } else if (std::strcmp(sep, "false") == 0) { kvo.val_bool = false; } else { LOG_ERR("%s: invalid boolean value for KV override '%s'\n", __func__, data); return false; } } else if (strncmp(sep, "str:", 4) == 0) { sep += 4; kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR; if (strlen(sep) > 127) { LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data); return false; } strncpy(kvo.val_str, sep, 127); kvo.val_str[127] = '\0'; } else { LOG_ERR("%s: invalid type for KV override '%s'\n", __func__, data); return false; } overrides.emplace_back(std::move(kvo)); return true; } // // Batch utils // void wcommon_batch_clear(struct llama_batch & batch) { batch.n_tokens = 0; } void wcommon_batch_add( struct llama_batch & batch, llama_token id, llama_pos pos, const std::vector & seq_ids, bool logits) { GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded"); batch.token [batch.n_tokens] = id; batch.pos [batch.n_tokens] = pos; batch.n_seq_id[batch.n_tokens] = seq_ids.size(); for (size_t i = 0; i < seq_ids.size(); ++i) { batch.seq_id[batch.n_tokens][i] = seq_ids[i]; } batch.logits [batch.n_tokens] = logits; batch.n_tokens++; } // // Token utils // size_t wcommon_lcp(const llama_tokens & a, const llama_tokens & b) { size_t i; for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} return i; } size_t wcommon_lcs(const llama_tokens & a, const llama_tokens & b) { // check for empty sequences if (a.empty() || b.empty()) { return 0; } // get the lengths of the input sequences size_t a_len = a.size(); size_t b_len = b.size(); // initialize the maximum length of the longest common subsequence (LCS) size_t max_length = 0; // use two rows instead of a 2D matrix to optimize space std::vector prev_row(b_len + 1, 0); std::vector curr_row(b_len + 1, 0); // iterate through the elements of a for (size_t i = 1; i <= a_len; i++) { // iterate through the elements of b for (size_t j = 1; j <= b_len; j++) { // if elements at the current positions match if (a[i - 1] == b[j - 1]) { // if it's the first element of either sequences, set LCS length to 1 if (i == 1 || j == 1) { curr_row[j] = 1; } else { // increment LCS length by 1 compared to the previous element curr_row[j] = prev_row[j - 1] + 1; } // update max_length if necessary if (curr_row[j] > max_length) { max_length = curr_row[j]; } } else { // reset LCS length if elements don't match curr_row[j] = 0; } } // update the previous row for the next iteration prev_row = curr_row; } // return the maximum length of the LCS return max_length; } // // Vocab utils // std::vector wcommon_tokenize( const struct llama_context * ctx, const std::string & text, bool add_special, bool parse_special) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); return wcommon_tokenize(vocab, text, add_special, parse_special); } std::vector wcommon_tokenize( const struct llama_vocab * vocab, const std::string & text, bool add_special, bool parse_special) { // upper limit for the number of tokens int n_tokens = text.length() + 2 * add_special; std::vector result(n_tokens); n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); if (n_tokens < 0) { result.resize(-n_tokens); int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); GGML_ASSERT(check == -n_tokens); } else { result.resize(n_tokens); } return result; } std::string wcommon_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); return wcommon_token_to_piece(vocab, token, special); } std::string wcommon_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) { std::string piece; piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n' const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special); if (n_chars < 0) { piece.resize(-n_chars); int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special); GGML_ASSERT(check == -n_chars); } else { piece.resize(n_chars); } return piece; } std::string wcommon_detokenize(const struct llama_context * ctx, const std::vector & tokens, bool special) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); return wcommon_detokenize(vocab, tokens, special); } std::string wcommon_detokenize(const struct llama_vocab * vocab, const std::vector & tokens, bool special) { std::string text; text.resize(std::max(text.capacity(), tokens.size())); int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); if (n_chars < 0) { text.resize(-n_chars); n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization } text.resize(n_chars); // NOTE: the original tokenizer decodes bytes after collecting the pieces. return text; } // // Chat template utils // // This is the old version without jinja support std::string wcommon_chat_apply_template(const struct llama_model *model, const std::string &tmpl, const std::vector &msgs, bool add_ass) { int alloc_size = 0; bool fallback = false; // indicate if we must fallback to default chatml std::vector chat; for (const auto &msg : msgs) { chat.push_back({msg.role.c_str(), msg.content.c_str()}); alloc_size += (msg.role.size() + msg.content.size()) * 1.25; } const char *ptr_tmpl = tmpl.empty() ? llama_model_chat_template(model, nullptr) : tmpl.c_str(); std::vector buf(alloc_size); // run the first time to get the total output length int32_t res = llama_chat_apply_template(ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); // error: chat template is not supported if (res < 0) { if (ptr_tmpl != nullptr) { throw std::runtime_error("this custom template is not supported"); } // If the built-in template is not supported, we default to chatml res = llama_chat_apply_template("chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size()); fallback = true; } // if it turns out that our buffer is too small, we resize it if ((size_t)res > buf.size()) { buf.resize(res); res = llama_chat_apply_template( fallback ? "chatml" : ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); } std::string formatted_chat(buf.data(), res); return formatted_chat; } // // Embedding utils // void wcommon_embd_normalize(const float * inp, float * out, int n, int embd_norm) { double sum = 0.0; switch (embd_norm) { case -1: // no normalisation sum = 1.0; break; case 0: // max absolute for (int i = 0; i < n; i++) { if (sum < std::abs(inp[i])) { sum = std::abs(inp[i]); } } sum /= 32760.0; // make an int16 range break; case 2: // euclidean for (int i = 0; i < n; i++) { sum += inp[i] * inp[i]; } sum = std::sqrt(sum); break; default: // p-norm (euclidean is p-norm p=2) for (int i = 0; i < n; i++) { sum += std::pow(std::abs(inp[i]), embd_norm); } sum = std::pow(sum, 1.0 / embd_norm); break; } const float norm = sum > 0.0 ? 1.0 / sum : 0.0f; for (int i = 0; i < n; i++) { out[i] = inp[i] * norm; } } float wcommon_embd_similarity_cos(const float * embd1, const float * embd2, int n){ double sum = 0.0; double sum1 = 0.0; double sum2 = 0.0; for (int i = 0; i < n; i++) { sum += embd1[i] * embd2[i]; sum1 += embd1[i] * embd1[i]; sum2 += embd2[i] * embd2[i]; } // Handle the case where one or both vectors are zero vectors if (sum1 == 0.0 || sum2 == 0.0) { if (sum1 == 0.0 && sum2 == 0.0) { return 1.0f; // two zero vectors are similar } return 0.0f; } return sum / (sqrt(sum1) * sqrt(sum2)); }