@@ -27,6 +27,7 @@
#include <errno.h>
#include <limits.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
@@ -34,6 +35,7 @@
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>
#include <algorithm>
#include <grpc/slice.h>
#include <grpc/support/alloc.h>
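The new <algorithm> include is pulled in for the std::min call added to tcp_do_read() below, which clamps the number of iovec entries to MAX_READ_IOVEC.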
@@ -54,6 +56,15 @@
#include "src/core/lib/slice/slice_internal.h"
#include "src/core/lib/slice/slice_string_helpers.h"
#ifndef SOL_TCP
#define SOL_TCP IPPROTO_TCP
#endif
#ifndef TCP_INQ
#define TCP_INQ 36
#define TCP_CM_INQ TCP_INQ
#endif
#ifdef GRPC_HAVE_MSG_NOSIGNAL
#define SENDMSG_FLAGS MSG_NOSIGNAL
#else
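TCP_INQ only appears in kernel headers from Linux 4.18 onward, hence the fallback definitions above; the value 36 mirrors the UAPI constant in <linux/tcp.h>, which exposes the same value under both the TCP_INQ and TCP_CM_INQ names. Once the option is enabled on a socket, each recvmsg() may carry a control message stating how many bytes were still queued in the kernel after that read. A minimal standalone reader, independent of this patch (read_with_inq is an illustrative helper, not gRPC API; error handling elided, Linux 4.18+ assumed), could recover that count like this:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Read from a connected TCP socket; *inq is set to the number of bytes
 * still queued in the kernel, or -1 if the kernel did not report it. */
static ssize_t read_with_inq(int fd, void* buf, size_t len, int* inq) {
  struct iovec iov = {buf, len};
  char control[CMSG_SPACE(sizeof(int))];
  struct msghdr msg;
  memset(&msg, 0, sizeof(msg));
  msg.msg_iov = &iov;
  msg.msg_iovlen = 1;
  msg.msg_control = control;
  msg.msg_controllen = sizeof(control);
  ssize_t n = recvmsg(fd, &msg, 0);
  *inq = -1;
  for (struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg); cmsg != nullptr;
       cmsg = CMSG_NXTHDR(&msg, cmsg)) {
    if (cmsg->cmsg_level == SOL_TCP && cmsg->cmsg_type == TCP_CM_INQ &&
        cmsg->cmsg_len == CMSG_LEN(sizeof(int))) {
      memcpy(inq, CMSG_DATA(cmsg), sizeof(int));
    }
  }
  return n;
}

The caller must first opt in with int one = 1; setsockopt(fd, SOL_TCP, TCP_INQ, &one, sizeof(one)); which is exactly what grpc_tcp_create() does at the end of this patch.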
@@ -88,8 +99,11 @@ struct grpc_tcp {
  grpc_slice_buffer last_read_buffer;
  grpc_slice_buffer* incoming_buffer;
  int inq;          /* bytes pending on the socket from the last read. */
  bool inq_capable; /* cache whether kernel supports inq */
  grpc_slice_buffer* outgoing_buffer;
  /** byte within outgoing_buffer->slices[0] to write next */
  /* byte within outgoing_buffer->slices[0] to write next */
  size_t outgoing_byte_idx;
  grpc_closure* read_cb;
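The two new fields work as a pair: inq_capable is probed once in grpc_tcp_create() and records whether the kernel accepted TCP_INQ, while inq caches the count from the most recent control message so the read path can tell, without another syscall, whether data is still waiting.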
@@ -429,69 +443,140 @@ static void tcp_do_read(grpc_tcp* tcp) {
  GPR_TIMER_SCOPE("tcp_do_read", 0);
  struct msghdr msg;
  struct iovec iov[MAX_READ_IOVEC];
  char cmsgbuf[24 /*CMSG_SPACE(sizeof(int))*/];
  ssize_t read_bytes;
  size_t i;
  GPR_ASSERT(tcp->incoming_buffer->count <= MAX_READ_IOVEC);
  size_t total_read_bytes = 0;
  for (i = 0; i < tcp->incoming_buffer->count; i++) {
  size_t iov_len =
      std::min<size_t>(MAX_READ_IOVEC, tcp->incoming_buffer->count);
  for (size_t i = 0; i < iov_len; i++) {
    iov[i].iov_base = GRPC_SLICE_START_PTR(tcp->incoming_buffer->slices[i]);
    iov[i].iov_len = GRPC_SLICE_LENGTH(tcp->incoming_buffer->slices[i]);
  }
  msg.msg_name = nullptr;
  msg.msg_namelen = 0;
  msg.msg_iov = iov;
  msg.msg_iovlen = static_cast<msg_iovlen_type>(tcp->incoming_buffer->count);
  msg.msg_control = nullptr;
  msg.msg_controllen = 0;
  msg.msg_flags = 0;
  GRPC_STATS_INC_TCP_READ_OFFER(tcp->incoming_buffer->length);
  GRPC_STATS_INC_TCP_READ_OFFER_IOV_SIZE(tcp->incoming_buffer->count);
  do {
    GPR_TIMER_SCOPE("recvmsg", 0);
    GRPC_STATS_INC_SYSCALL_READ();
    read_bytes = recvmsg(tcp->fd, &msg, 0);
  } while (read_bytes < 0 && errno == EINTR);
  if (read_bytes < 0) {
    /* NB: After calling call_read_cb a parallel call of the read handler may
     * be running. */
    if (errno == EAGAIN) {
      finish_estimate(tcp);
      /* We've consumed the edge, request a new one */
      notify_on_read(tcp);
    /* Assume there is something on the queue. If we receive TCP_INQ from
     * kernel, we will update this value, otherwise, we have to assume there is
     * always something to read until we get EAGAIN. */
    tcp->inq = 1;
    msg.msg_name = nullptr;
    msg.msg_namelen = 0;
    msg.msg_iov = iov;
    msg.msg_iovlen = static_cast<msg_iovlen_type>(iov_len);
    if (tcp->inq_capable) {
      msg.msg_control = cmsgbuf;
      msg.msg_controllen = sizeof(cmsgbuf);
    } else {
      msg.msg_control = nullptr;
      msg.msg_controllen = 0;
    }
    msg.msg_flags = 0;
    GRPC_STATS_INC_TCP_READ_OFFER(tcp->incoming_buffer->length);
    GRPC_STATS_INC_TCP_READ_OFFER_IOV_SIZE(tcp->incoming_buffer->count);
    do {
      GPR_TIMER_SCOPE("recvmsg", 0);
      GRPC_STATS_INC_SYSCALL_READ();
      read_bytes = recvmsg(tcp->fd, &msg, 0);
    } while (read_bytes < 0 && errno == EINTR);
    /* We have read something in previous reads. We need to deliver those
     * bytes to the upper layer. */
    if (read_bytes <= 0 && total_read_bytes > 0) {
      tcp->inq = 1;
      break;
    }
    if (read_bytes < 0) {
      /* NB: After calling call_read_cb a parallel call of the read handler may
       * be running. */
      if (errno == EAGAIN) {
        finish_estimate(tcp);
        tcp->inq = 0;
        /* We've consumed the edge, request a new one */
        notify_on_read(tcp);
      } else {
        grpc_slice_buffer_reset_and_unref_internal(tcp->incoming_buffer);
        call_read_cb(tcp,
                     tcp_annotate_error(GRPC_OS_ERROR(errno, "recvmsg"), tcp));
        TCP_UNREF(tcp, "read");
      }
      return;
    }
    if (read_bytes == 0) {
      /* 0 read size ==> end of stream
       *
       * We may have read something, i.e., total_read_bytes > 0, but
       * since the connection is closed we will drop the data here, because we
       * can't call the callback multiple times. */
      grpc_slice_buffer_reset_and_unref_internal(tcp->incoming_buffer);
      call_read_cb(tcp,
                   tcp_annotate_error(GRPC_OS_ERROR(errno, "recvmsg"), tcp));
      call_read_cb(
          tcp, tcp_annotate_error(
                   GRPC_ERROR_CREATE_FROM_STATIC_STRING("Socket closed"), tcp));
      TCP_UNREF(tcp, "read");
      return;
    }
  } else if (read_bytes == 0) {
    /* 0 read size ==> end of stream */
    grpc_slice_buffer_reset_and_unref_internal(tcp->incoming_buffer);
    call_read_cb(
        tcp, tcp_annotate_error(
                 GRPC_ERROR_CREATE_FROM_STATIC_STRING("Socket closed"), tcp));
    TCP_UNREF(tcp, "read");
  } else {
    GRPC_STATS_INC_TCP_READ_SIZE(read_bytes);
    add_to_estimate(tcp, static_cast<size_t>(read_bytes));
    GPR_ASSERT((size_t)read_bytes <= tcp->incoming_buffer->length);
    if (static_cast<size_t>(read_bytes) == tcp->incoming_buffer->length) {
      finish_estimate(tcp);
    } else if (static_cast<size_t>(read_bytes) < tcp->incoming_buffer->length) {
      grpc_slice_buffer_trim_end(
          tcp->incoming_buffer,
          tcp->incoming_buffer->length - static_cast<size_t>(read_bytes),
          &tcp->last_read_buffer);
    }
    GPR_DEBUG_ASSERT((size_t)read_bytes <=
                     tcp->incoming_buffer->length - total_read_bytes);
#ifdef GRPC_HAVE_TCP_INQ
    if (tcp->inq_capable) {
      GPR_DEBUG_ASSERT(!(msg.msg_flags & MSG_CTRUNC));
      struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
      for (; cmsg != nullptr; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
        if (cmsg->cmsg_level == SOL_TCP && cmsg->cmsg_type == TCP_CM_INQ &&
            cmsg->cmsg_len == CMSG_LEN(sizeof(int))) {
          tcp->inq = *reinterpret_cast<int*>(CMSG_DATA(cmsg));
        }
      }
    }
    GPR_ASSERT((size_t)read_bytes == tcp->incoming_buffer->length);
    call_read_cb(tcp, GRPC_ERROR_NONE);
    TCP_UNREF(tcp, "read");
#endif /* GRPC_HAVE_TCP_INQ */
    total_read_bytes += read_bytes;
    if (tcp->inq == 0 || total_read_bytes == tcp->incoming_buffer->length) {
      /* We have filled incoming_buffer, and we cannot read any more. */
      break;
    }
    /* We had a partial read, and still have space to read more data.
     * So, adjust IOVs and try to read more. */
    size_t remaining = read_bytes;
    size_t j = 0;
    for (size_t i = 0; i < iov_len; i++) {
      if (remaining >= iov[i].iov_len) {
        remaining -= iov[i].iov_len;
        continue;
      }
      if (remaining > 0) {
        iov[j].iov_base = static_cast<char*>(iov[i].iov_base) + remaining;
        iov[j].iov_len = iov[i].iov_len - remaining;
        remaining = 0;
      } else {
        iov[j].iov_base = iov[i].iov_base;
        iov[j].iov_len = iov[i].iov_len;
      }
      ++j;
    }
    iov_len = j;
  } while (true);
  if (tcp->inq == 0) {
    finish_estimate(tcp);
  }
  GPR_DEBUG_ASSERT(total_read_bytes > 0);
  if (total_read_bytes < tcp->incoming_buffer->length) {
    grpc_slice_buffer_trim_end(tcp->incoming_buffer,
                               tcp->incoming_buffer->length - total_read_bytes,
                               &tcp->last_read_buffer);
  }
  call_read_cb(tcp, GRPC_ERROR_NONE);
  TCP_UNREF(tcp, "read");
}

static void tcp_read_allocation_done(void* tcpp, grpc_error* error) {
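The heart of the change is the new outer do/while in tcp_do_read(): rather than one recvmsg() per poll edge, the loop keeps reading until the incoming buffer is full, the call would block, or the kernel-reported queue depth (tcp->inq) drops to zero, and only then delivers everything through a single call_read_cb(). After a partial read, the iovec array is compacted in place so the next recvmsg() targets only the unfilled slices. That compaction step, extracted as a standalone helper for illustration (not part of the patch), looks like this:

#include <sys/uio.h>

/* Drop `consumed` bytes from the front of an iovec array in place and
 * return the new entry count. Mirrors the adjustment loop above. */
static size_t consume_iovec(struct iovec* iov, size_t iov_len,
                            size_t consumed) {
  size_t j = 0;
  for (size_t i = 0; i < iov_len; i++) {
    if (consumed >= iov[i].iov_len) {
      consumed -= iov[i].iov_len; /* slice fully filled: drop it */
      continue;
    }
    if (consumed > 0) {
      /* slice partially filled: keep only its unread tail */
      iov[j].iov_base = static_cast<char*>(iov[i].iov_base) + consumed;
      iov[j].iov_len = iov[i].iov_len - consumed;
      consumed = 0;
    } else {
      iov[j] = iov[i]; /* untouched slice: shift it down */
    }
    ++j;
  }
  return j;
}

For example, with two 8-byte slices and consumed == 10, the result is a single entry pointing 2 bytes into the second slice, 6 bytes long.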
@@ -512,7 +597,8 @@ static void tcp_read_allocation_done(void* tcpp, grpc_error* error) {
static void tcp_continue_read(grpc_tcp* tcp) {
  size_t target_read_size = get_target_read_size(tcp);
  if (tcp->incoming_buffer->length < target_read_size / 2 &&
  /* Wait for allocation only when there is no buffer left. */
  if (tcp->incoming_buffer->length == 0 &&
      tcp->incoming_buffer->count < MAX_READ_IOVEC) {
    if (grpc_tcp_trace.enabled()) {
      gpr_log(GPR_INFO, "TCP:%p alloc_slices", tcp);
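Since the drain loop now hands everything it read to the callback in one shot, tcp_continue_read() no longer tops the buffer back up to target_read_size on every pass: it waits on the resource-quota allocator only when the previous read consumed the buffer entirely, and otherwise reuses whatever slices are still available.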
@@ -544,7 +630,7 @@ static void tcp_handle_read(void* arg /* grpc_tcp */, grpc_error* error) {
}
static void tcp_read(grpc_endpoint* ep, grpc_slice_buffer* incoming_buffer,
                     grpc_closure* cb) {
                     grpc_closure* cb, bool urgent) {
  grpc_tcp* tcp = reinterpret_cast<grpc_tcp*>(ep);
  GPR_ASSERT(tcp->read_cb == nullptr);
  tcp->read_cb = cb;
@@ -557,6 +643,11 @@ static void tcp_read(grpc_endpoint* ep, grpc_slice_buffer* incoming_buffer,
     * the polling engine */
    tcp->is_first_read = false;
    notify_on_read(tcp);
  } else if (!urgent && tcp->inq == 0) {
    /* Upper layer asked to read more but we know there is no pending data
     * to read from previous reads. So, wait for POLLIN.
     */
    notify_on_read(tcp);
  } else {
    /* Not the first time. We may or may not have more bytes available. In any
     * case call tcp->read_done_closure (i.e tcp_handle_read()) which does the
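This branch is where the cached inq pays off: a non-urgent read issued while the kernel queue is known to be empty parks on the poller instead of scheduling tcp_handle_read() immediately, saving a recvmsg() that could only return EAGAIN. The urgent flag, threaded through the grpc_endpoint_read() vtable by the signature change above, lets callers that cannot tolerate the extra poll latency keep the old speculative-read behavior.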
@@ -1157,6 +1248,19 @@ grpc_endpoint* grpc_tcp_create(grpc_fd* em_fd,
  tcp->tb_head = nullptr;
  GRPC_CLOSURE_INIT(&tcp->read_done_closure, tcp_handle_read, tcp,
                    grpc_schedule_on_exec_ctx);
  /* Always assume there is something on the queue to read. */
  tcp->inq = 1;
#ifdef GRPC_HAVE_TCP_INQ
  int one = 1;
  if (setsockopt(tcp->fd, SOL_TCP, TCP_INQ, &one, sizeof(one)) == 0) {
    tcp->inq_capable = true;
  } else {
    gpr_log(GPR_INFO, "cannot set inq fd=%d errno=%d", tcp->fd, errno);
    tcp->inq_capable = false;
  }
#else
  tcp->inq_capable = false;
#endif /* GRPC_HAVE_TCP_INQ */
  /* Start being notified on errors if event engine can track errors. */
  if (grpc_event_engine_can_track_errors()) {
    /* Grab a ref to tcp so that we can safely access the tcp struct when
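Probing with setsockopt() at endpoint creation detects support at runtime rather than at build time: on kernels older than 4.18 the call fails (typically with ENOPROTOOPT), inq_capable stays false, and every read conservatively assumes inq == 1 until EAGAIN. The same probe works in isolation, as a sketch:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Returns true when the running kernel accepts TCP_INQ on this socket. */
static bool kernel_supports_tcp_inq(int fd) {
  int one = 1;
  return setsockopt(fd, SOL_TCP, TCP_INQ, &one, sizeof(one)) == 0;
}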