Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Couch stats resource tracker v2 #5213

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/chttpd/src/chttpd.erl
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,9 @@ handle_request_int(MochiReq) ->
% Save client socket so that it can be monitored for disconnects
chttpd_util:mochiweb_client_req_set(MochiReq),

%% This is probably better in before_request, but having Path is nice
couch_stats_resource_tracker:create_coordinator_context(HttpReq0, Path),

{HttpReq2, Response} =
case before_request(HttpReq0) of
{ok, HttpReq1} ->
Expand Down Expand Up @@ -372,6 +375,7 @@ after_request(HttpReq, HttpResp0) ->
HttpResp2 = update_stats(HttpReq, HttpResp1),
chttpd_stats:report(HttpReq, HttpResp2),
maybe_log(HttpReq, HttpResp2),
couch_stats_resource_tracker:destroy_context(),
HttpResp2.

process_request(#httpd{mochi_req = MochiReq} = HttpReq) ->
Expand Down Expand Up @@ -409,10 +413,12 @@ handle_req_after_auth(HandlerKey, HttpReq) ->
HandlerKey,
fun chttpd_db:handle_request/1
),
couch_stats_resource_tracker:set_context_handler_fun(HandlerFun),
AuthorizedReq = chttpd_auth:authorize(
possibly_hack(HttpReq),
fun chttpd_auth_request:authorize_request/1
),
couch_stats_resource_tracker:set_context_username(AuthorizedReq),
{AuthorizedReq, HandlerFun(AuthorizedReq)}
catch
ErrorType:Error:Stack ->
Expand Down
2 changes: 2 additions & 0 deletions src/chttpd/src/chttpd_db.erl
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@

% Database request handlers
handle_request(#httpd{path_parts = [DbName | RestParts], method = Method} = Req) ->
couch_stats_resource_tracker:set_context_dbname(DbName),
case {Method, RestParts} of
{'PUT', []} ->
create_db_req(Req, DbName);
Expand All @@ -103,6 +104,7 @@ handle_request(#httpd{path_parts = [DbName | RestParts], method = Method} = Req)
do_db_req(Req, fun db_req/2);
{_, [SecondPart | _]} ->
Handler = chttpd_handlers:db_handler(SecondPart, fun db_req/2),
couch_stats_resource_tracker:set_context_handler_fun(Handler),
do_db_req(Req, Handler)
end.

Expand Down
1 change: 1 addition & 0 deletions src/chttpd/src/chttpd_httpd_handlers.erl
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ url_handler(<<"_utils">>) -> fun chttpd_misc:handle_utils_dir_req/1;
url_handler(<<"_all_dbs">>) -> fun chttpd_misc:handle_all_dbs_req/1;
url_handler(<<"_dbs_info">>) -> fun chttpd_misc:handle_dbs_info_req/1;
url_handler(<<"_active_tasks">>) -> fun chttpd_misc:handle_task_status_req/1;
url_handler(<<"_active_resources">>) -> fun chttpd_misc:handle_resource_status_req/1;
url_handler(<<"_scheduler">>) -> fun couch_replicator_httpd:handle_scheduler_req/1;
url_handler(<<"_node">>) -> fun chttpd_node:handle_node_req/1;
url_handler(<<"_reload_query_servers">>) -> fun chttpd_misc:handle_reload_query_servers_req/1;
Expand Down
105 changes: 105 additions & 0 deletions src/chttpd/src/chttpd_misc.erl
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
handle_replicate_req/1,
handle_reload_query_servers_req/1,
handle_task_status_req/1,
handle_resource_status_req/1,
handle_up_req/1,
handle_utils_dir_req/1,
handle_utils_dir_req/2,
Expand Down Expand Up @@ -224,6 +225,110 @@ handle_task_status_req(#httpd{method = 'GET'} = Req) ->
handle_task_status_req(Req) ->
send_method_not_allowed(Req, "GET,HEAD").

handle_resource_status_req(#httpd{method = 'POST'} = Req) ->
ok = chttpd:verify_is_server_admin(Req),
chttpd:validate_ctype(Req, "application/json"),
{Props} = chttpd:json_body_obj(Req),
Action = proplists:get_value(<<"action">>, Props),
Key = proplists:get_value(<<"key">>, Props),
Val = proplists:get_value(<<"val">>, Props),
Comment on lines +228 to +234
Copy link
Contributor

@nickva nickva Sep 4, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we're introducing a new API add some docs or some description (examples) how it works, and the intent behind it. What would CouchDB users use it for, how it is different than metrics, and active tasks, etc...


CountBy = fun couch_stats_resource_tracker:count_by/1,
GroupBy = fun couch_stats_resource_tracker:group_by/2,
SortedBy1 = fun couch_stats_resource_tracker:sorted_by/1,
SortedBy2 = fun couch_stats_resource_tracker:sorted_by/2,
ConvertEle = fun(K) -> list_to_existing_atom(binary_to_list(K)) end,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ConvertList = fun(L) -> [ConvertEle(E) || E <- L] end,
ToJson = fun couch_stats_resource_tracker:term_to_flat_json/1,
JsonKeys = fun(PL) -> [[ToJson(K), V] || {K, V} <- PL] end,

Fun = case {Action, Key, Val} of
{<<"count_by">>, Keys, undefined} when is_list(Keys) ->
Keys1 = [ConvertEle(K) || K <- Keys],
fun() -> CountBy(Keys1) end;
{<<"count_by">>, Key, undefined} ->
Key1 = ConvertEle(Key),
fun() -> CountBy(Key1) end;
{<<"group_by">>, Keys, Vals} when is_list(Keys) andalso is_list(Vals) ->
Keys1 = ConvertList(Keys),
Vals1 = ConvertList(Vals),
fun() -> GroupBy(Keys1, Vals1) end;
{<<"group_by">>, Key, Vals} when is_list(Vals) ->
Key1 = ConvertEle(Key),
Vals1 = ConvertList(Vals),
fun() -> GroupBy(Key1, Vals1) end;
{<<"group_by">>, Keys, Val} when is_list(Keys) ->
Keys1 = ConvertList(Keys),
Val1 = ConvertEle(Val),
fun() -> GroupBy(Keys1, Val1) end;
{<<"group_by">>, Key, Val} ->
Key1 = ConvertEle(Key),
Val1 = ConvertList(Val),
fun() -> GroupBy(Key1, Val1) end;

{<<"sorted_by">>, Key, undefined} ->
Key1 = ConvertEle(Key),
fun() -> JsonKeys(SortedBy1(Key1)) end;
{<<"sorted_by">>, Keys, undefined} when is_list(Keys) ->
Keys1 = [ConvertEle(K) || K <- Keys],
fun() -> JsonKeys(SortedBy1(Keys1)) end;
{<<"sorted_by">>, Keys, Vals} when is_list(Keys) andalso is_list(Vals) ->
Keys1 = ConvertList(Keys),
Vals1 = ConvertList(Vals),
fun() -> JsonKeys(SortedBy2(Keys1, Vals1)) end;
{<<"sorted_by">>, Key, Vals} when is_list(Vals) ->
Key1 = ConvertEle(Key),
Vals1 = ConvertList(Vals),
fun() -> JsonKeys(SortedBy2(Key1, Vals1)) end;
{<<"sorted_by">>, Keys, Val} when is_list(Keys) ->
Keys1 = ConvertList(Keys),
Val1 = ConvertEle(Val),
fun() -> JsonKeys(SortedBy2(Keys1, Val1)) end;
{<<"sorted_by">>, Key, Val} ->
Key1 = ConvertEle(Key),
Val1 = ConvertList(Val),
fun() -> JsonKeys(SortedBy2(Key1, Val1)) end;
_ ->
throw({badrequest, invalid_resource_request})
end,

Fun1 = fun() ->
case Fun() of
Map when is_map(Map) ->
{maps:fold(
fun
(_K,0,A) -> A; %% TODO: Skip 0 value entries?
(K,V,A) -> [{ToJson(K), V} | A]
end,
[], Map)};
List when is_list(List) ->
List
end
end,

{Resp, _Bad} = rpc:multicall(erlang, apply, [
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fun() ->
{node(), Fun1()}
end,
[]
]),
%%io:format("{CSRT}***** GOT RESP: ~p~n", [Resp]),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Left-over debug comment

send_json(Req, {Resp});
handle_resource_status_req(#httpd{method = 'GET'} = Req) ->
ok = chttpd:verify_is_server_admin(Req),
{Resp, Bad} = rpc:multicall(erlang, apply, [
fun() ->
{node(), couch_stats_resource_tracker:active()}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of having to run in a closure just to get node() in the response, consider using an explicit list of nodes [node() | nodes()] with a plain [M, F, A] and then zipping over the nodes in the response. It's a bit longer but it's less fragile.

end,
[]
]),
%% TODO: incorporate Bad responses
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

consider using erpc, the handling of bad responses is a bit cleaner there, I think

send_json(Req, {Resp});
handle_resource_status_req(Req) ->
ok = chttpd:verify_is_server_admin(Req),
send_method_not_allowed(Req, "GET,HEAD,POST").


handle_replicate_req(#httpd{method = 'POST', user_ctx = Ctx, req_body = PostBody} = Req) ->
chttpd:validate_ctype(Req, "application/json"),
%% see HACK in chttpd.erl about replication
Expand Down
7 changes: 4 additions & 3 deletions src/chttpd/test/eunit/chttpd_db_doc_size_tests.erl
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@

setup() ->
Hashed = couch_passwords:hash_admin_password(?PASS),
ok = config:set("admins", ?USER, ?b2l(Hashed), _Persist = false),
ok = config:set("couchdb", "max_document_size", "50"),
ok = config:set("admins", ?USER, ?b2l(Hashed), false),
ok = config:set("couchdb", "max_document_size", "50", false),

TmpDb = ?tempdb(),
Addr = config:get("chttpd", "bind_address", "127.0.0.1"),
Port = mochiweb_socket_server:get(chttpd, port),
Expand All @@ -35,7 +36,7 @@ setup() ->

teardown(Url) ->
delete_db(Url),
ok = config:delete("admins", ?USER, _Persist = false),
ok = config:delete("admins", ?USER, false),
ok = config:delete("couchdb", "max_document_size").

create_db(Url) ->
Expand Down
2 changes: 2 additions & 0 deletions src/couch/include/couch_db.hrl
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@
-define(INTERACTIVE_EDIT, interactive_edit).
-define(REPLICATED_CHANGES, replicated_changes).

-define(LOG_UNEXPECTED_MSG(Msg), couch_log:warning("[~p:~p:~p/~p]{~p[~p]} Unexpected message: ~w", [?MODULE, ?LINE, ?FUNCTION_NAME, ?FUNCTION_ARITY, self(), element(2, process_info(self(), message_queue_len)), Msg])).
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am surprised linter doesn't complain about such long line.


-type branch() :: {Key::term(), Value::term(), Tree::term()}.
-type path() :: {Start::pos_integer(), branch()}.
-type update_type() :: replicated_changes | interactive_edit.
Expand Down
41 changes: 41 additions & 0 deletions src/couch/priv/stats_descriptions.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,10 @@
{type, counter},
{desc, <<"number of couch_server LRU operations skipped">>}
]}.
{[couchdb, couch_server, open], [
{type, counter},
{desc, <<"number of couch_server open operations invoked">>}
]}.
{[couchdb, query_server, vdu_rejects], [
{type, counter},
{desc, <<"number of rejections by validate_doc_update function">>}
Expand Down Expand Up @@ -418,10 +422,47 @@
{type, counter},
{desc, <<"number of other requests">>}
]}.
{[couchdb, query_server, js_filter], [
{type, counter},
{desc, <<"number of JS filter invocations">>}
]}.
Comment on lines +425 to +428
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is already a .query_server.calls.ddoc_filter counter

{[couchdb, query_server, js_filtered_docs], [
{type, counter},
{desc, <<"number of docs filtered through JS invocations">>}
]}.
{[couchdb, query_server, js_filter_error], [
{type, counter},
{desc, <<"number of JS filter invocation errors">>}
]}.
{[couchdb, legacy_checksums], [
{type, counter},
{desc, <<"number of legacy checksums found in couch_file instances">>}
]}.
{[couchdb, btree, folds], [
{type, counter},
{desc, <<"number of couch btree kv fold callback invocations">>}
]}.
{[couchdb, btree, get_node, kp_node], [
{type, counter},
{desc, <<"number of couch btree kp_nodes read">>}
]}.
{[couchdb, btree, get_node, kv_node], [
{type, counter},
{desc, <<"number of couch btree kv_nodes read">>}
]}.
Comment on lines +445 to +452
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wonder if this is a bit too low level? Or rather, btrees are used for view and db files. It's not clear which btrees the metrics refer to.

{[couchdb, btree, write_node, kp_node], [
{type, counter},
{desc, <<"number of couch btree kp_nodes written">>}
]}.
{[couchdb, btree, write_node, kv_node], [
{type, counter},
{desc, <<"number of couch btree kv_nodes written">>}
]}.
%% CSRT (couch_stats_resource_tracker) stats
{[couchdb, csrt, delta_missing_t0], [
{type, counter},
{desc, <<"number of csrt contexts without a proper startime">>}
]}.
{[pread, exceed_eof], [
{type, counter},
{desc, <<"number of the attempts to read beyond end of db file">>}
Expand Down
3 changes: 3 additions & 0 deletions src/couch/src/couch_btree.erl
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,8 @@ reduce_tree_size(kp_node, NodeSize, [{_K, {_P, _Red, Sz}} | NodeList]) ->

get_node(#btree{fd = Fd}, NodePos) ->
{ok, {NodeType, NodeList}} = couch_file:pread_term(Fd, NodePos),
%% TODO: wire in csrt tracking
couch_stats:increment_counter([couchdb, btree, get_node, NodeType]),
{NodeType, NodeList}.

write_node(#btree{fd = Fd, compression = Comp} = Bt, NodeType, NodeList) ->
Expand All @@ -480,6 +482,7 @@ write_node(#btree{fd = Fd, compression = Comp} = Bt, NodeType, NodeList) ->
% now write out each chunk and return the KeyPointer pairs for those nodes
ToWrite = [{NodeType, Chunk} || Chunk <- Chunks],
WriteOpts = [{compression, Comp}],
couch_stats:increment_counter([couchdb, btree, write_node, NodeType]),
{ok, PtrSizes} = couch_file:append_terms(Fd, ToWrite, WriteOpts),
{ok, group_kps(Bt, NodeType, Chunks, PtrSizes)}.

Expand Down
2 changes: 2 additions & 0 deletions src/couch/src/couch_db.erl
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,7 @@ open_doc(Db, IdOrDocInfo) ->
open_doc(Db, IdOrDocInfo, []).

open_doc(Db, Id, Options) ->
%% TODO: wire in csrt tracking
increment_stat(Db, [couchdb, database_reads]),
case open_doc_int(Db, Id, Options) of
{ok, #doc{deleted = true} = Doc} ->
Expand Down Expand Up @@ -1982,6 +1983,7 @@ increment_stat(#db{options = Options}, Stat, Count) when
->
case lists:member(sys_db, Options) of
true ->
%% TODO: we shouldn't leak resource usage just because it's a sys_db
ok;
false ->
couch_stats:increment_counter(Stat, Count)
Expand Down
8 changes: 8 additions & 0 deletions src/couch/src/couch_query_servers.erl
Original file line number Diff line number Diff line change
Expand Up @@ -542,6 +542,8 @@ filter_docs(Req, Db, DDoc, FName, Docs) ->
{ok, filter_docs_int(Db, DDoc, FName, JsonReq, JsonDocs)}
catch
throw:{os_process_error, {exit_status, 1}} ->
%% TODO: wire in csrt tracking
couch_stats:increment_counter([couchdb, query_server, js_filter_error]),
Comment on lines +545 to +546
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We already have a general counter for os process errors, normal exits and error exits. Probably don't need an explicit one just for filters?

%% batch used too much memory, retry sequentially.
Fun = fun(JsonDoc) ->
filter_docs_int(Db, DDoc, FName, JsonReq, [JsonDoc])
Expand All @@ -550,6 +552,12 @@ filter_docs(Req, Db, DDoc, FName, Docs) ->
end.

filter_docs_int(Db, DDoc, FName, JsonReq, JsonDocs) ->
%% Count usage in _int version as this can be repeated for OS error
%% Pros & cons... might not have actually processed `length(JsonDocs)` docs
%% but it certainly undercounts if we count in `filter_docs/5` above
%% TODO: wire in csrt tracking
couch_stats:increment_counter([couchdb, query_server, js_filter]),
couch_stats:increment_counter([couchdb, query_server, js_filtered_docs], length(JsonDocs)),
[true, Passes] = ddoc_prompt(
Db,
DDoc,
Expand Down
2 changes: 2 additions & 0 deletions src/couch/src/couch_server.erl
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ sup_start_link(N) ->
gen_server:start_link({local, couch_server(N)}, couch_server, [N], []).

open(DbName, Options) ->
%% TODO: wire in csrt tracking
couch_stats:increment_counter([couchdb, couch_server, open]),
try
validate_open_or_create(DbName, Options),
open_int(DbName, Options)
Expand Down
8 changes: 6 additions & 2 deletions src/couch_stats/src/couch_stats.app.src
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,12 @@
{application, couch_stats, [
{description, "Simple statistics collection"},
{vsn, git},
{registered, [couch_stats_aggregator, couch_stats_process_tracker]},
{applications, [kernel, stdlib]},
{registered, [
couch_stats_aggregator,
couch_stats_process_tracker,
couch_stats_resource_tracker
]},
{applications, [kernel, stdlib, couch_log]},
{mod, {couch_stats_app, []}},
{env, []}
]}.
30 changes: 30 additions & 0 deletions src/couch_stats/src/couch_stats.erl
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,12 @@
update_gauge/2
]).

%% couch_stats_resource_tracker API
-export([
create_context/3,
maybe_track_rexi_init_p/1
]).

-type response() :: ok | {error, unknown_metric} | {error, invalid_metric}.
-type stat() :: {any(), [{atom(), any()}]}.

Expand All @@ -49,6 +55,11 @@ increment_counter(Name) ->

-spec increment_counter(any(), pos_integer()) -> response().
increment_counter(Name, Value) ->
%% Should maybe_track_local happen before or after notify?
%% If after, only currently tracked metrics declared in the app's
%% stats_description.cfg will be trackable locally. Pros/cons.
%io:format("NOTIFY_EXISTING_METRIC: ~p || ~p || ~p~n", [Name, Op, Type]),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Left-over debug statement

ok = maybe_track_local_counter(Name, Value),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this is enabled for all requests, a bit worried about the performance here as we add an ets operation over a simple integer counter bump. Was there any noticeable performance impact from it, during perf runs?

case couch_stats_util:get_counter(Name, stats()) of
{ok, Ctx} -> couch_stats_counter:increment(Ctx, Value);
{error, Error} -> {error, Error}
Expand Down Expand Up @@ -100,6 +111,25 @@ stats() ->
now_sec() ->
erlang:monotonic_time(second).

%% Only potentially track positive increments to counters
-spec maybe_track_local_counter(any(), any()) -> ok.
maybe_track_local_counter(Name, Val) when is_integer(Val) andalso Val > 0 ->
%%io:format("maybe_track_local[~p]: ~p~n", [Val, Name]),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

debug left-over?

couch_stats_resource_tracker:maybe_inc(Name, Val),
ok;
maybe_track_local_counter(_, _) ->
ok.

create_context(From, MFA, Nonce) ->
couch_stats_resource_tracker:create_context(From, MFA, Nonce).

maybe_track_rexi_init_p({M, F, _A}) ->
Metric = [M, F, spawned],
case couch_stats_resource_tracker:should_track(Metric) of
true -> increment_counter(Metric);
false -> ok
end.

-ifdef(TEST).

-include_lib("couch/include/couch_eunit.hrl").
Expand Down
Loading