-
Notifications
You must be signed in to change notification settings - Fork 955
[POC] Key blocking for dirty writes #2865
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: unstable
Are you sure you want to change the base?
Changes from all commits
1568821
e8b5ded
1e383e0
7d1d0cb
ab6bc74
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,116 @@ | ||
| #ifndef DURABLE_WRITE_H | ||
| #define DURABLE_WRITE_H | ||
|
|
||
| #include <inttypes.h> | ||
| #include <sys/types.h> | ||
| #include <stdbool.h> | ||
| #include "expire.h" | ||
| #include "sds.h" | ||
|
|
||
| #define DURABLE_ACCESSED_DATA_UNAVAILABLE "Accessed data unavailable to be served" | ||
| /* Command filter codes that are used in pre execution stage of a command. */ | ||
| #define CMD_FILTER_ALLOW 0 | ||
| #define CMD_FILTER_REJECT 1 | ||
|
|
||
| struct client; | ||
| struct serverObject; | ||
| struct serverDb; | ||
| struct list; | ||
| struct listNode; | ||
|
|
||
| typedef long long mstime_t; | ||
|
|
||
| /** | ||
| * Durability container to house all the durability related fields. | ||
| */ | ||
| typedef struct durable_t { | ||
| /* Uncommitted keys cleanup configuration time limit in milliseconds */ | ||
| unsigned int keys_cleanup_time_limit_ms; | ||
| /* The current scanning database index, starting from 0 */ | ||
| int curr_db_scan_idx; | ||
|
|
||
| /* Number of replicas to ack for an update to be considered committed */ | ||
| long long num_replicas_to_ack; | ||
|
|
||
| /* clients waiting for offset ack/quorum*/ | ||
| struct list *clients_waiting_replica_ack; | ||
|
|
||
| // cached allocation of replica offsets to prevent allocation per cmd. | ||
| unsigned long replica_offsets_size; | ||
| long long *replica_offsets; | ||
| // Previously acknowledged replication offset by replicas | ||
| long long previous_acked_offset; | ||
| } durable_t; | ||
|
|
||
| // Blocked response structure used by client to mark | ||
| // the blocking information associated with each response | ||
| typedef struct blockedResponse { | ||
| // Pointer to the client's reply node where the blocked response starts. | ||
| // NULL if the blocked response starts from the 16KB initial buffer | ||
| // Here we don't take ownership of this pointer so we never | ||
| // release the memory pointed to by this block. | ||
| struct listNode *disallowed_reply_block; | ||
| // The boundary in the reply buffer where the blocked response starts. | ||
| // We don't write data from this point onwards to the client socket | ||
| size_t disallowed_byte_offset; | ||
| // The replication offset to wait for ACK from replicas | ||
| long long primary_repl_offset; | ||
| } blockedResponse; | ||
|
|
||
| // Describes a pre-execution COB offset for a client | ||
| typedef struct preExecutionOffsetPosition { | ||
| // True if the pre execution offset/reply block are initialized | ||
| bool recorded; | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When would it be not recorded? |
||
| // Track initial client COB position for client blocking | ||
| // Pointer to the pre-execution reply node, NULL for initial buffer | ||
| struct listNode *reply_block; | ||
| // Byte position boundary within the pre-execution reply block | ||
| size_t byte_offset; | ||
| } preExecutionOffsetPosition; | ||
|
|
||
| typedef struct clientDurabilityInfo { | ||
| // Blocked client responses list for consistency | ||
| struct list *blocked_responses; | ||
|
|
||
| /* Pre-execution data recorded before a command is executed | ||
| * to record the boundaries of the COB. */ | ||
| preExecutionOffsetPosition offset; | ||
|
|
||
| // Replication offset to block this current command response | ||
| long long current_command_repl_offset; | ||
| } clientDurableInfo; | ||
|
|
||
| /** | ||
| * Init | ||
| */ | ||
| void durableInit(void); | ||
| void durableClientInit(struct client *c); | ||
| void durableClientReset(struct client *c); | ||
| /* | ||
| Command processing hooks for offset and cob tracking | ||
| */ | ||
| void preCall(void); | ||
| void postCall(struct client *c); | ||
| int preCommandExec(struct client *c); | ||
| void postCommandExec(struct client *c); | ||
| void postReplicaAck(void); | ||
|
|
||
| /* | ||
| Utils | ||
| */ | ||
| int isPrimaryDurabilityEnabled(void); | ||
| bool isClientReplyBufferLimited(struct client *c); | ||
| long long durablePurgeAndGetUncommittedKeyOffset(const sds key, struct serverDb *db); | ||
| // TODO: naming of these flags. | ||
| int isDurabilityEnabled(void); | ||
| void clearUncommittedKeysAcknowledged(void); | ||
| // TODO: | ||
| // preReplyToBlockedClient | ||
| // for streams and timeounts, when a blocked client is being unblocked | ||
| // before a reply is added, the command will not be reprocessed via processCommand() | ||
| // we should hook this to get pre-execution offsets | ||
|
|
||
|
|
||
|
|
||
|
|
||
| #endif /* DURABLE_WRITE_H */ | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @ranshid Do we remove the read handler callback while the client is blocked? |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -33,6 +33,7 @@ | |
| */ | ||
| #include "server.h" | ||
| #include "connection.h" | ||
| #include "durable_write.h" | ||
| #include "monotonic.h" | ||
| #include "cluster.h" | ||
| #include "cluster_slot_stats.h" | ||
|
|
@@ -1694,6 +1695,7 @@ long long serverCron(struct aeEventLoop *eventLoop, long long id, void *clientDa | |
| run_with_period(100) modulesCron(); | ||
| } | ||
|
|
||
| run_with_period(1000) clearUncommittedKeysAcknowledged(); | ||
| /* Fire the cron loop modules event. */ | ||
| ValkeyModuleCronLoopV1 ei = {VALKEYMODULE_CRON_LOOP_VERSION, server.hz}; | ||
| moduleFireServerEvent(VALKEYMODULE_EVENT_CRON_LOOP, 0, &ei); | ||
|
|
@@ -2815,6 +2817,10 @@ serverDb *createDatabase(int id) { | |
| db->ready_keys = dictCreate(&objectKeyPointerValueDictType); | ||
| db->watched_keys = dictCreate(&keylistDictType); | ||
| db->id = id; | ||
|
|
||
| db->uncommitted_keys = raxNew(); | ||
| db->dirty_repl_offset = -1; | ||
| db->scan_in_progress = 0; | ||
| resetDbExpiryState(db); | ||
| return db; | ||
| } | ||
|
|
@@ -3035,7 +3041,7 @@ void initServer(void) { | |
| commandlogInit(); | ||
| latencyMonitorInit(); | ||
| initSharedQueryBuf(); | ||
|
|
||
| durableInit(); | ||
| /* Initialize ACL default password if it exists */ | ||
| ACLUpdateDefaultUserPassword(server.requirepass); | ||
|
|
||
|
|
@@ -3785,6 +3791,7 @@ void call(client *c, int flags) { | |
| struct ClientFlags client_old_flags = c->flag; | ||
|
|
||
| struct serverCommand *real_cmd = c->realcmd; | ||
| preCall(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe (Functions in some other areas use prefixes like |
||
| client *prev_client = server.executing_client; | ||
| server.executing_client = c; | ||
|
|
||
|
|
@@ -3982,7 +3989,9 @@ void call(client *c, int flags) { | |
| if (zmalloc_used > server.stat_peak_memory) server.stat_peak_memory = zmalloc_used; | ||
|
|
||
| /* Do some maintenance job and cleanup */ | ||
| // TODO: should blocking postCall could be moved into afterCommand? | ||
| afterCommand(c); | ||
| postCall(c); | ||
|
Comment on lines
3991
to
+3994
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The names "after command" and "post call" basically mean the same thing. Let's rename both of them to better explain what they do. For example something like |
||
|
|
||
| /* Remember the replication offset of the client, right after its last | ||
| * command that resulted in propagation. */ | ||
|
|
@@ -4516,8 +4525,12 @@ int processCommand(client *c) { | |
| queueMultiCommand(c, cmd_flags); | ||
| addReply(c, shared.queued); | ||
| } else { | ||
| if (preCommandExec(c) == CMD_FILTER_REJECT) { | ||
| return C_OK; | ||
| } | ||
| int flags = CMD_CALL_FULL; | ||
| call(c, flags); | ||
| postCommandExec(c); | ||
| if (listLength(server.ready_keys) && !isInsideYieldingLongCommand()) handleClientsBlockedOnKeys(); | ||
| } | ||
| return C_OK; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -37,6 +37,7 @@ | |
| #include "commands.h" | ||
| #include "allocator_defrag.h" | ||
|
|
||
| #include <stdint.h> | ||
| #include <stdio.h> | ||
| #include <stdlib.h> | ||
| #include <stddef.h> | ||
|
|
@@ -52,7 +53,7 @@ | |
| #include <netinet/in.h> | ||
| #include <sys/socket.h> | ||
| #include <signal.h> | ||
|
|
||
| #include "durable_write.h" | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The other
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. gotcha! will do and keep it in mind for other revisions/prs |
||
| #ifdef HAVE_LIBSYSTEMD | ||
| #include <systemd/sd-daemon.h> | ||
| #endif | ||
|
|
@@ -868,6 +869,7 @@ typedef struct replBufBlock { | |
| char buf[]; | ||
| } replBufBlock; | ||
|
|
||
|
|
||
| /* Database representation. There are multiple databases identified | ||
| * by integers from 0 (the default database) up to the max configured | ||
| * database. The database number is the 'id' field in the structure. */ | ||
|
|
@@ -886,6 +888,13 @@ typedef struct serverDb { | |
| long long avg_ttl; /* Average TTL, just for stats */ | ||
| unsigned long cursor; /* Cursor of the active expire cycle. */ | ||
| } expiry[ACTIVE_EXPIRY_TYPE_COUNT]; | ||
|
|
||
| /* fields related to dirty key tracking | ||
| * for consistent writes with durability */ | ||
| rax *uncommitted_keys; /* Map of dirty keys to the offset required by replica acknowledgement */ | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A RAX probably fine for the PoC. It's similar to the RAX that we used in the past to track the slot-to-key mapping. We replaced that one with one hashtable per slot instead. This RAX tracking is costly because it duplicates the key names and there are multiple pointers to follow. A hashtable is flat. In the long term, I think we should store the offset within the reference-counted key-value object itself (robj metadata of some sort). If we need a flag to mark it as dirty, we can steal one of the bits from the reference counter. To find all the dirty keys, we can use a hashtable, probably one per slot in cluster mode, so we can probably use a kvstore.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We had some discussion on this and would prefer to porting into hashtable as well. Thing to note, this isn't permanent data and is ephemeral in nature.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes, so a middle way is to only add a bit flag in the robj so we can avoid looking up the RAX every time a non-dirty key is accessed. |
||
| long long dirty_repl_offset; /* Replication offset for a dirty DB */ | ||
| raxIterator next_scan_iter; /* The next iterator for db scan */ | ||
| int scan_in_progress; /* Flag of showing whether db is in scan or not */ | ||
| } serverDb; | ||
|
|
||
| /* forward declaration for functions ctx */ | ||
|
|
@@ -1168,6 +1177,8 @@ typedef struct ClientFlags { | |
| or client::buf. */ | ||
| uint64_t keyspace_notified : 1; /* Indicates that a keyspace notification was triggered during the execution of the | ||
| current command. */ | ||
| uint64_t durable_blocked_client: 1; /* This is a durable blocked client that is waiting for the server to | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this the 65th flag? It means this bit field grows from 64 bits to 128 and we get 63 unused bits in the end. I think it doesn't go well with the union in the client struct, which requires it to be 64 bits, so we can do things like clearing all flags by setting Perhaps we can consider adding a new, separate, set of flags, like
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ah, fair enough. Maybe a separate set of durability-related flags? (maybe it doesn't much help unless we plan to have several here) |
||
| * acknowledge the write of the command that caused it to be blocked. */ | ||
| } ClientFlags; | ||
|
|
||
| typedef struct ClientPubSubData { | ||
|
|
@@ -1366,6 +1377,7 @@ typedef struct client { | |
| #ifdef LOG_REQ_RES | ||
| clientReqResInfo reqres; | ||
| #endif | ||
| struct clientDurabilityInfo clientDurabilityInfo; | ||
| } client; | ||
|
|
||
| /* When a command generates a lot of discrete elements to the client output buffer, it is much faster to | ||
|
|
@@ -1665,6 +1677,7 @@ typedef enum childInfoType { | |
| } childInfoType; | ||
|
|
||
| struct valkeyServer { | ||
| durable_t durability; | ||
| /* General */ | ||
| pid_t pid; /* Main process pid. */ | ||
| pthread_t main_thread_id; /* Main thread id */ | ||
|
|
@@ -2927,6 +2940,9 @@ int processIOThreadsWriteDone(void); | |
| void releaseReplyReferences(client *c); | ||
| void resetLastWrittenBuf(client *c); | ||
|
|
||
| //TODO:jules move this elsewhere | ||
| int getIntFromObject(robj *o, int *target); | ||
|
|
||
| int parseExtendedCommandArgumentsOrReply(client *c, int *flags, int *unit, robj **expire, robj **compare_val, int command_type, int max_args); | ||
|
|
||
| /* logreqres.c - logging of requests and responses */ | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What the code in this file does is to implement the WBL and block/unblock client replies. Durably replicated writes is more than just this. A name like
wbl.correplyblocking.cwould be more exact for this file.