2008年8月13日 星期三

Samba Performance Issue Debug Note

Linux Kernel Sendfile() 的提升 Server 效能之路
http://fred-zone.blogspot.com/2011/03/linux-kernel-sendfile-server.html

Kernel 2.6.25 gets splice TCP read support.
http://lists.samba.org/archive/samba-technical/2008-February/057844.html

[Samba] Samba write performance in kernel
http://lists.samba.org/archive/samba/2008-August/143128.html

(Same thread, continued in September 2008)
http://lists.samba.org/archive/samba/2008-September/143362.html

recvfile by linux splice
http://lists.samba.org/archive/samba-technical/2008-September/061408.html


Finally, the file size issue is fixed.
Index: lib/recvfile.c
===================================================================
--- lib/recvfile.c (revision 198)
+++ lib/recvfile.c (working copy)
@@ -148,10 +148,13 @@
static bool try_splice_call = true;
size_t total_written = 0;

+ int pipefd[2];
+ ssize_t ret;
+
DEBUG(10,("sys_recvfile: from = %d, to = %d, "
- "offset=%.0f, count = %lu\n",
+ "offset=%.0f, count = %lu, try_splice_call=%s\n",
fromfd, tofd, (double)offset,
- (unsigned long)count));
+ (unsigned long)count,try_splice_call?"true":"false"));

if (count == 0) {
return 0;
@@ -171,18 +174,27 @@
count);
}

- while (total_written < count) {
- ssize_t ret = splice(fromfd,
- NULL,
- tofd,
- &offset,
- count,
- 0);
+ if((ret=pipe(pipefd))){
+ DEBUG(0,("pipe creation failed,ret=%d(%x)\n",ret,ret));
+ return 0;
+ }
+ while (count!=0) {
+ //splice socket to pipe
+ if((ret = splice( fromfd, NULL,
+ pipefd[1],NULL,
+ count,SPLICE_F_NONBLOCK))>=0){
+ //splice pipe to file
+ ret = splice( pipefd[0], NULL,
+ tofd, &offset,
+ ret, SPLICE_F_NONBLOCK);
+ }
if (ret == -1) {
- if (errno != EINTR) {
+ if (errno != EINTR&&errno!=EAGAIN) {
if (total_written == 0 &&
(errno == EBADF || errno == EINVAL)) {
try_splice_call = false;
+ close(pipefd[0]);
+ close(pipefd[1]);
return default_sys_recvfile(fromfd,
tofd,
offset,
@@ -201,11 +213,15 @@
if (drain_socket(fromfd, count-total_written) !=
count-total_written) {
/* socket is dead. */
+ close(pipefd[0]);
+ close(pipefd[1]);
return -1;
}
errno = saved_errno;
}

+ close(pipefd[0]);
+ close(pipefd[1]);
return total_written;
}
#else

I repeated the test, this time really going over the network and into a file, and it succeeded...

# ./splice-fromnet 2001 | ./splice-out -m testfile
# cat bigfile | netcat 2001

So I rewrote splice-fromnet (as splice-fromnet-tofile), and it got stuck too.
--- splice-fromnet.c 2007-06-22 21:25:02.000000000 +0800
+++ splice-fromnet-tofile.c.0 2008-09-24 19:55:36.000000000 +0800
@@ -21,6 +21,9 @@
static unsigned int splice_flags;
static int wait_for_poll;

+static char* outputfile="receivedfile";
+static int outputfd;
+static long written_size=0;
static int usage(char *name)
{
fprintf(stderr, "%s: [-s splice size] [-w wait for poll] [-n non-blocking] port\n", name);
@@ -29,8 +32,11 @@

static int splice_from_net(int fd)
{
+ int pipefd[2];
+ if(pipe(pipefd)) return error("pipe");
+
while (1) {
- int ret;
+ int ret,ret2;

if (wait_for_poll) {
struct pollfd pfd = {
@@ -47,9 +53,9 @@
if (!(pfd.revents & POLLIN))
continue;
}
-
- ret = ssplice(fd, NULL, STDOUT_FILENO, NULL, splice_size, 0);
-
+ ret = ssplice(fd, NULL, pipefd[1], NULL, splice_size, 0);
+ ret2 = ssplice(pipefd[0], NULL, outputfd, NULL, ret , 0);
+ written_size+=ret2;
if (ret < 0)
return error("splice");
else if (!ret)
@@ -89,7 +95,7 @@
{
int c, index = 1;

- while ((c = getopt(argc, argv, "s:w:n")) != -1) {
+ while ((c = getopt(argc, argv, "s:w:f:n")) != -1) {
switch (c) {
case 's':
splice_size = atoi(optarg);
@@ -99,6 +105,10 @@
wait_for_poll = atoi(optarg);
index++;
break;
+ case 'f':
+ outputfile = atoi(optarg);
+ index++;
+ break;
case 'n':
splice_flags |= SPLICE_F_NONBLOCK;
index++;
@@ -121,12 +131,12 @@
if (argc < 2)
return usage(argv[0]);

- if (check_output_pipe())
- return usage(argv[0]);
-
index = parse_options(argc, argv);
if (index == -1 || index + 1 > argc)
return usage(argv[0]);
+ outputfd=open(outputfile,O_WRONLY|O_CREAT|O_TRUNC, 0644);
+ if(outputfd<0) return error("open");
+

port = atoi(argv[index]);


I made the following changes to samba-3.2.2. Now splice works for small files (<500kB), but larger files (>800kB) may fail.
Index: lib/recvfile.c
===================================================================
--- lib/recvfile.c (revision 198)
+++ lib/recvfile.c (working copy)
@@ -148,10 +148,13 @@
static bool try_splice_call = true;
size_t total_written = 0;

- DEBUG(10,("sys_recvfile: from = %d, to = %d, "
- "offset=%.0f, count = %lu\n",
+ int filepipe[2];
+ ssize_t ret;
+
+ DEBUG(0,("sys_recvfile: from = %d, to = %d, "
+ "offset=%.0f, count = %lu, try_splice_call=%s\n",
fromfd, tofd, (double)offset,
- (unsigned long)count));
+ (unsigned long)count,try_splice_call?"true":"false"));

if (count == 0) {
return 0;
@@ -171,13 +174,18 @@
count);
}

- while (total_written < count) {
- ssize_t ret = splice(fromfd,
- NULL,
- tofd,
- &offset,
- count,
- 0);
+ if(pipe(filepipe)) return 0;
+
+ while (count!=0) {
+ //splice socket to pipe
+ if((ret = splice( fromfd, NULL,
+ filepipe[1],NULL,
+ count,0))>=0){
+ //splice pipe to file
+ ret = splice( filepipe[0], NULL,
+ tofd, &offset,
+ ret, 0);
+ }
if (ret == -1) {
if (errno != EINTR) {
if (total_written == 0 &&


sys_splice -> do_splice returns -EINVAL

do_splice checks that at least one of the in and out fds is a pipe (FIFO). Both fds that Samba passes to splice (fromfd, the socket, and tofd, the file) fail that check, so do_splice returns -EINVAL.

linux-2.6.26.5/include/linux/stat.h
  29#define S_ISFIFO(m)     (((m) & S_IFMT) == S_IFIFO)

linux-2.6.26.5/fs/splice.c
1084
1085/*
1086 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
1087 * location, so checking ->i_pipe is not enough to verify that this is a
1088 * pipe.
1089 */
1090static inline struct pipe_inode_info *pipe_info(struct inode *inode)
1091{
1092 if (S_ISFIFO(inode->i_mode))
1093 return inode->i_pipe;
1094
1095 return NULL;
1096}
1097
1098/*
1099 * Determine where to splice to/from.
1100 */
1101static long do_splice(struct file *in, loff_t __user *off_in,
1102 struct file *out, loff_t __user *off_out,
1103 size_t len, unsigned int flags)
1104{
1105 struct pipe_inode_info *pipe;
1106 loff_t offset, *off;
1107 long ret;
1108
1109 pipe = pipe_info(in->f_path.dentry->d_inode);
1110 if (pipe) {
1111 if (off_in)
1112 return -ESPIPE;
1113 if (off_out) {
1114 if (out->f_op->llseek == no_llseek)
1115 return -EINVAL;
1116 if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1117 return -EFAULT;
1118 off = &offset;
1119 } else
1120 off = &out->f_pos;
1121
1122 ret = do_splice_from(pipe, out, off, len, flags);
1123
1124 if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
1125 ret = -EFAULT;
1126
1127 return ret;
1128 }
1129
1130 pipe = pipe_info(out->f_path.dentry->d_inode);
1131 if (pipe) {
1132 if (off_out)
1133 return -ESPIPE;
1134 if (off_in) {
1135 if (in->f_op->llseek == no_llseek)
1136 return -EINVAL;
1137 if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1138 return -EFAULT;
1139 off = &offset;
1140 } else
1141 off = &in->f_pos;
1142
1143 ret = do_splice_to(in, off, pipe, len, flags);
1144
1145 if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
1146 ret = -EFAULT;
1147
1148 return ret;
1149 }
1150
1151 return -EINVAL;
1152}
1153
(..............................)
1486asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
1487 int fd_out, loff_t __user *off_out,
1488 size_t len, unsigned int flags)
1489{
1490 long error;
1491 struct file *in, *out;
1492 int fput_in, fput_out;
1493
1494 if (unlikely(!len))
1495 return 0;
1496
1497 error = -EBADF;
1498 in = fget_light(fd_in, &fput_in);
1499 if (in) {
1500 if (in->f_mode & FMODE_READ) {
1501 out = fget_light(fd_out, &fput_out);
1502 if (out) {
1503 if (out->f_mode & FMODE_WRITE)
1504 error = do_splice(in, off_in,
1505 out, off_out,
1506 len, flags);
1507 fput_light(out, fput_out);
1508 }
1509 }
1510
1511 fput_light(in, fput_in);
1512 }
1513
1514 return error;
1515}
1516


Doesn't work even up to 2.6.27-rc5.
http://bbs.chinaunix.net/viewthread.php?tid=1079572
要到2.6.25以後?



Samba 3.2.2 has support for splice for receive file, but it doesn't work yet.

samba-3.2.2/source/lib/recvfile.c
#if defined(HAVE_LINUX_SPLICE)

/*
* Receive count bytes from the socket fromfd into the file tofd at
* offset, trying the Linux splice() system call first.
*
* Intended contract (as stated by the original comment): return -1
* only if the socket read failed; otherwise return the number of
* bytes actually written, having consumed count bytes from the
* network.
*
* If the very first splice() attempt fails with EBADF or EINVAL, the
* static try_splice_call flag is cleared and this call, as well as
* every later call, is handled by default_sys_recvfile() instead.
*
* NOTE(review): splice() is called here with a socket as input and a
* file as output. According to the do_splice() listing elsewhere in
* this note, the kernel returns -EINVAL unless one end is a pipe, so
* this path appears to always end up in the fallback.
*/


ssize_t sys_recvfile(int fromfd,
int tofd,
SMB_OFF_T offset,
size_t count)
{
/* Remembered across calls: cleared once splice() proves unusable. */
static bool try_splice_call = true;
size_t total_written = 0;

DEBUG(10,("sys_recvfile: from = %d, to = %d, "
"offset=%.0f, count = %lu, try_splice_call=%s\n",
fromfd, tofd, (double)offset,
(unsigned long)count,try_splice_call?"true":"false"));

/* Nothing requested, nothing to do. */
if (count == 0) {
return 0;
}

/*
* Older Linux kernels have splice for sendfile,
* but it fails for recvfile. Ensure we only try
* this once and always fall back to the userspace
* implementation if recvfile splice fails. JRA.
*/

if (!try_splice_call) {
return default_sys_recvfile(fromfd,
tofd,
offset,
count);
}

while (total_written < count) {
/* Socket position is implicit (NULL); file position is passed via &offset. */
ssize_t ret = splice(fromfd,
NULL,
tofd,
&offset,
count,
0);
if (ret == -1) {
if (errno != EINTR) {
/*
* Failure before any data moved, with an errno that
* indicates splice() cannot be used on these fds:
* disable splice for good and use the fallback.
*/
if (total_written == 0 &&
(errno == EBADF || errno == EINVAL)) {
try_splice_call = false;
return default_sys_recvfile(fromfd,
tofd,
offset,
count);
}
/* Any other error ends the transfer loop. */
break;
}
/* Interrupted by a signal: retry the same splice(). */
continue;
}
/*
* NOTE(review): count is decremented here while the loop
* condition and the checks after the loop still compare it
* against total_written, so the loop can stop before the
* whole request is transferred (e.g. 100 requested at 30
* per call stops after 60). A return of 0 from splice()
* makes no progress and would keep the loop spinning.
*/
total_written += ret;
count -= ret;
}

/*
* Short transfer: hand the remainder to drain_socket() (presumably
* it reads and discards that many bytes - confirm) and keep the
* errno of the failed splice() for the caller.
*/
if (total_written < count) {
int saved_errno = errno;
if (drain_socket(fromfd, count-total_written) !=
count-total_written) {
/* socket is dead. */
return -1;
}
errno = saved_errno;
}

return total_written;
}
#else

/*****************************************************************
No recvfile system call - use the default 128 chunk implementation.
This variant is compiled when HAVE_LINUX_SPLICE is not defined and
simply forwards all four arguments to default_sys_recvfile().
*****************************************************************/

ssize_t sys_recvfile(int fromfd,
int tofd,
SMB_OFF_T offset,
size_t count)
{
return default_sys_recvfile(fromfd, tofd, offset, count);
}
#endif
and enable the option in smb.conf
min receivefile size=1

man smb.conf.5
       min receivefile size (G)

This option changes the behavior of smbd(8) when processing SMBwriteX calls. Any incoming SMBwriteX call on a non-signed SMB/CIFS connection
greater than this value will not be processed in the normal way but will be passed to any underlying kernel recvfile or splice system call (if
there is no such call Samba will emulate in user space). This allows zero-copy writes directly from network socket buffers into the filesystem
buffer cache, if available. It may improve performance but user testing is recommended. If set to zero Samba processes SMBwriteX calls in the
normal way. To enable POSIX large write support (SMB/CIFS writes up to 16Mb) this option must be nonzero. The maximum value is 128k. Values
greater than 128k will be silently set to 128k.

Note this option will have NO EFFECT if set on a SMB signed connection.

The default is zero, which disables this option.

Default: min receivefile size = 0



Linux has splice support since 2.6.17
http://lists.samba.org/archive/samba/2006-December/127887.html
http://kerneltrap.org/node/6505.
http://lwn.net/Articles/181170/




linux-2.6.16/net/socket.c

/*
* Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
* in the operation structures but are done directly via the socketcall() multiplexor.
*/

/*
* File operations table used for socket files, as quoted from
* linux-2.6.16. No splice-related hook appears in this initializer,
* which is consistent with the note above that splice support only
* arrived in 2.6.17. Seeking is explicitly unsupported (no_llseek).
*/
static struct file_operations socket_file_ops = {
.owner = THIS_MODULE,
.llseek = no_llseek,
.aio_read = sock_aio_read,
.aio_write = sock_aio_write,
.poll = sock_poll,
.unlocked_ioctl = sock_ioctl,
.mmap = sock_mmap,
.open = sock_no_open, /* special open code to disallow open via /proc */
.release = sock_close,
.fasync = sock_fasync,
.readv = sock_readv,
.writev = sock_writev,
.sendpage = sock_sendpage
};




kernel question
1. packet收到後放在哪? (socket裡的queue?)
2. socket如何收 (sys_read socket?)

userspace
1. call sys_read socket到sys_write disk的過程



Samba write file
call frequency 1>2>>3>4

1. smbd calling sys_read socket to get received data
__arch_copy_to_user
copy_to_user
memcpy_toiovec
skb_copy_datagram_iovec
tcp_rcv_established
tcp_v4_do_rcv
release_sock
tcp_recvmsg
sock_common_recvmsg
sock_aio_read
do_sync_read
vfs_read
sys_read
ret_fast_syscall

2. smbd writes received data to disk
__arch_copy_from_user
__copy_from_user
generic_file_buffered_write
__generic_file_aio_write_nolock
generic_file_write
vfs_write
sys_write
ret_fast_syscall

3. sys_fcntl
__arch_copy_from_user
copy_from_user
fcntl_getlk
do_fcntl
sys_fcntl
ret_fast_syscall

4. Unknown: what is it reading?
__arch_copy_to_user
__copy_to_user
file_read_actor
do_generic_mapping_read
__generic_file_aio_read
generic_file_read
vfs_read
sys_read
ret_fast_syscall



./arch/arm/lib/uaccess.S (not built, no obj file)
ENTRY(__arch_copy_to_user)
stmfd sp!, {r2, r4 - r7, lr}
cmp r2, #4
blt .Lc2u_not_enough
ands ip, r0, #3
bne .Lc2u_dest_not_aligned


./arch/arm/lib/copy_to_user.S
ENTRY(__arch_copy_to_user)

#include "copy_template.S"

.section .fixup,"ax"
.align 0
copy_abort_preamble
ldmfd sp!, {r1, r2, r3}
sub r0, r0, r1
rsb r0, r0, r2
copy_abort_end
.previous


./include/asm-arm/uaccess.h
extern unsigned long __arch_copy_from_user(void *to, const void __user *from, unsigned long n);
extern unsigned long __arch_copy_to_user(void __user *to, const void *from, unsigned long n);

(............................................)

/*
* Checked copy from user space into the kernel buffer `to`.
* The source range is validated with access_ok(VERIFY_READ, ...) before
* __arch_copy_from_user() is called. If validation fails, nothing is
* copied, the destination is zero-filled and n is returned unchanged.
* Otherwise returns the value of __arch_copy_from_user() - presumably
* the number of bytes that could not be copied (0 on success); confirm
* against the arch implementation.
*/
static inline unsigned long copy_from_user(void *to, const void __user *from, unsigned long n)
{
if (access_ok(VERIFY_READ, from, n))
n = __arch_copy_from_user(to, from, n);
else /* security hole - plug it */
memzero(to, n);
return n;
}

/*
* Unchecked variant of copy_from_user(): calls __arch_copy_from_user()
* directly without any access_ok() validation and returns its result.
* This is the entry seen in the sys_write call trace in this note.
*/
static inline unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n)
{
return __arch_copy_from_user(to, from, n);
}

/*
* Checked copy from the kernel buffer `from` to user space.
* The destination range is validated with access_ok(VERIFY_WRITE, ...);
* if validation fails nothing is copied and n is returned unchanged.
* Otherwise returns the value of __arch_copy_to_user() - presumably the
* number of bytes that could not be copied (0 on success); confirm
* against the arch implementation.
*/
static inline unsigned long copy_to_user(void __user *to, const void *from, unsigned long n)
{
if (access_ok(VERIFY_WRITE, to, n))
n = __arch_copy_to_user(to, from, n);
return n;
}

/*
* Unchecked variant of copy_to_user(): calls __arch_copy_to_user()
* directly without any access_ok() validation and returns its result.
* This is the entry seen in the file_read_actor call trace in this note.
*/
static inline unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n)
{
return __arch_copy_to_user(to, from, n);
}

沒有留言: