linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Glauber Costa <glommer@parallels.com>
To: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	containers@lists.osdl.org, netdev@vger.kernel.org,
	xemul@parallels.com, "David S. Miller" <davem@davemloft.net>,
	"Eric W. Biederman" <ebiederm@xmission.com>
Subject: Re: [PATCH v2 1/9] per-netns ipv4 sysctl_tcp_mem
Date: Fri, 9 Sep 2011 01:19:08 -0300	[thread overview]
Message-ID: <4E6993BC.5030004@parallels.com> (raw)
In-Reply-To: <20110909114720.f050c520.kamezawa.hiroyu@jp.fujitsu.com>

On 09/08/2011 11:47 PM, KAMEZAWA Hiroyuki wrote:
> On Wed,  7 Sep 2011 01:23:11 -0300
> Glauber Costa<glommer@parallels.com>  wrote:
>
>> This patch allows each namespace to independently set up
>> its levels for tcp memory pressure thresholds. This patch
>> alone does not buy much: we need to make this values
>> per group of process somehow. This is achieved in the
>> patches that follows in this patchset.
>>
>> Signed-off-by: Glauber Costa<glommer@parallels.com>
>> CC: David S. Miller<davem@davemloft.net>
>> CC: Hiroyouki Kamezawa<kamezawa.hiroyu@jp.fujitsu.com>
>> CC: Eric W. Biederman<ebiederm@xmission.com>
>
>
> Hmm, it may be better to post this patch as independent one.


Maybe we can search acks from eric about this one
specifically prior to merging, but I'd still like it to be part of
the whole. It will put us in a weird state if this is merged, and
the rest is not.

> I'm not familiar with this area...but try review ;)
Thank you!

>
>> ---
>>   include/net/netns/ipv4.h   |    1 +
>>   include/net/tcp.h          |    1 -
>>   net/ipv4/sysctl_net_ipv4.c |   51 +++++++++++++++++++++++++++++++++++++------
>>   net/ipv4/tcp.c             |   13 +++-------
>>   4 files changed, 49 insertions(+), 17 deletions(-)
>>
>> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
>> index d786b4f..bbd023a 100644
>> --- a/include/net/netns/ipv4.h
>> +++ b/include/net/netns/ipv4.h
>> @@ -55,6 +55,7 @@ struct netns_ipv4 {
>>   	int current_rt_cache_rebuild_count;
>>
>>   	unsigned int sysctl_ping_group_range[2];
>> +	long sysctl_tcp_mem[3];
>>
>>   	atomic_t rt_genid;
>>   	atomic_t dev_addr_genid;
>
> Hmm, in original placement, sysctl_tcp_mem[] was on __read_mostly
> area. Doesn't this placement cause many cache invalidations ?
>
Yes, you are right. I will move back to the old way of doing it.

>
>> diff --git a/include/net/tcp.h b/include/net/tcp.h
>> index 149a415..6bfdd9b 100644
>> --- a/include/net/tcp.h
>> +++ b/include/net/tcp.h
>> @@ -230,7 +230,6 @@ extern int sysctl_tcp_fack;
>>   extern int sysctl_tcp_reordering;
>>   extern int sysctl_tcp_ecn;
>>   extern int sysctl_tcp_dsack;
>> -extern long sysctl_tcp_mem[3];
>>   extern int sysctl_tcp_wmem[3];
>>   extern int sysctl_tcp_rmem[3];
>>   extern int sysctl_tcp_app_win;
>> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
>> index 69fd720..0d74b9d 100644
>> --- a/net/ipv4/sysctl_net_ipv4.c
>> +++ b/net/ipv4/sysctl_net_ipv4.c
>> @@ -14,6 +14,7 @@
>>   #include<linux/init.h>
>>   #include<linux/slab.h>
>>   #include<linux/nsproxy.h>
>> +#include<linux/swap.h>
>>   #include<net/snmp.h>
>>   #include<net/icmp.h>
>>   #include<net/ip.h>
>> @@ -174,6 +175,36 @@ static int proc_allowed_congestion_control(ctl_table *ctl,
>>   	return ret;
>>   }
>>
>> +static int ipv4_tcp_mem(ctl_table *ctl, int write,
>> +			   void __user *buffer, size_t *lenp,
>> +			   loff_t *ppos)
>> +{
>> +	int ret;
>> +	unsigned long vec[3];
>> +	struct net *net = current->nsproxy->net_ns;
>> +	int i;
>> +
>> +	ctl_table tmp = {
>> +		.data =&vec,
>> +		.maxlen = sizeof(vec),
>> +		.mode = ctl->mode,
>> +	};
>> +
>> +	if (!write) {
>> +		ctl->data =&net->ipv4.sysctl_tcp_mem;
>> +		return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos);
>> +	}
>> +
>> +	ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos);
>> +	if (ret)
>> +		return ret;
>> +
>> +	for (i = 0; i<  3; i++)
>> +		net->ipv4.sysctl_tcp_mem[i] = vec[i];
>> +
>> +	return 0;
>> +}
>> +
>>   static struct ctl_table ipv4_table[] = {
>>   	{
>>   		.procname	= "tcp_timestamps",
>> @@ -433,13 +464,6 @@ static struct ctl_table ipv4_table[] = {
>>   		.proc_handler	= proc_dointvec
>>   	},
>>   	{
>> -		.procname	= "tcp_mem",
>> -		.data		=&sysctl_tcp_mem,
>> -		.maxlen		= sizeof(sysctl_tcp_mem),
>> -		.mode		= 0644,
>> -		.proc_handler	= proc_doulongvec_minmax
>> -	},
>> -	{
>>   		.procname	= "tcp_wmem",
>>   		.data		=&sysctl_tcp_wmem,
>>   		.maxlen		= sizeof(sysctl_tcp_wmem),
>> @@ -721,6 +745,12 @@ static struct ctl_table ipv4_net_table[] = {
>>   		.mode		= 0644,
>>   		.proc_handler	= ipv4_ping_group_range,
>>   	},
>> +	{
>> +		.procname	= "tcp_mem",
>> +		.maxlen		= sizeof(init_net.ipv4.sysctl_tcp_mem),
>> +		.mode		= 0644,
>> +		.proc_handler	= ipv4_tcp_mem,
>> +	},
>>   	{ }
>>   };
>>
>
>
>
>
>> @@ -734,6 +764,7 @@ EXPORT_SYMBOL_GPL(net_ipv4_ctl_path);
>>   static __net_init int ipv4_sysctl_init_net(struct net *net)
>>   {
>>   	struct ctl_table *table;
>> +	unsigned long limit;
>>
>>   	table = ipv4_net_table;
>>   	if (!net_eq(net,&init_net)) {
>> @@ -769,6 +800,12 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
>>
>>   	net->ipv4.sysctl_rt_cache_rebuild_count = 4;
>>
>> +	limit = nr_free_buffer_pages() / 8;
>> +	limit = max(limit, 128UL);
>> +	net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
>> +	net->ipv4.sysctl_tcp_mem[1] = limit;
>> +	net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
>> +
>>   	net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
>>   			net_ipv4_ctl_path, table);
>>   	if (net->ipv4.ipv4_hdr == NULL)
>> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
>> index 46febca..f06df24 100644
>> --- a/net/ipv4/tcp.c
>> +++ b/net/ipv4/tcp.c
>> @@ -266,6 +266,7 @@
>>   #include<linux/crypto.h>
>>   #include<linux/time.h>
>>   #include<linux/slab.h>
>> +#include<linux/nsproxy.h>
>>
>>   #include<net/icmp.h>
>>   #include<net/tcp.h>
>> @@ -282,11 +283,9 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
>>   struct percpu_counter tcp_orphan_count;
>>   EXPORT_SYMBOL_GPL(tcp_orphan_count);
>>
>> -long sysctl_tcp_mem[3] __read_mostly;
>>   int sysctl_tcp_wmem[3] __read_mostly;
>>   int sysctl_tcp_rmem[3] __read_mostly;
>>
>> -EXPORT_SYMBOL(sysctl_tcp_mem);
>>   EXPORT_SYMBOL(sysctl_tcp_rmem);
>>   EXPORT_SYMBOL(sysctl_tcp_wmem);
>>
>> @@ -3277,14 +3276,10 @@ void __init tcp_init(void)
>>   	sysctl_tcp_max_orphans = cnt / 2;
>>   	sysctl_max_syn_backlog = max(128, cnt / 256);
>>
>> -	limit = nr_free_buffer_pages() / 8;
>> -	limit = max(limit, 128UL);
>> -	sysctl_tcp_mem[0] = limit / 4 * 3;
>> -	sysctl_tcp_mem[1] = limit;
>> -	sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
>> -
>>   	/* Set per-socket limits to no more than 1/128 the pressure threshold */
>> -	limit = ((unsigned long)sysctl_tcp_mem[1])<<  (PAGE_SHIFT - 7);
>> +	limit = (unsigned long)init_net.ipv4.sysctl_tcp_mem[1];
>> +	limit<<= (PAGE_SHIFT - 7);
>> +
>
> I'm not sure but...why defined as 'long'  ?
>

It is part of the "it was there
before" bundle.

It is defined as long not only for tcp, but for all of the
equivalents sysctl as well. So no reason to touch it, at least not
in this series =)

>
> BTW, when I grep,
>
> tcp_input.c:        atomic_long_read(&tcp_memory_allocated)<  sysctl_tcp_mem[0])
> tcp_input.c:    if (atomic_long_read(&tcp_memory_allocated)>= sysctl_tcp_mem[0])
>
> Don't you need to change this ?

It ended up being changed in another patch, and I missed the right
split.

Thank you, I will reorder it so it gets changed correctly.
>
>
> Thanks,
> -Kame
>
>
>
>
>
>
>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  reply	other threads:[~2011-09-09  4:20 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-09-07  4:23 [PATCH v2 0/9] per-cgroup tcp buffers limitation Glauber Costa
2011-09-07  4:23 ` [PATCH v2 1/9] per-netns ipv4 sysctl_tcp_mem Glauber Costa
2011-09-09  2:47   ` KAMEZAWA Hiroyuki
2011-09-09  4:19     ` Glauber Costa [this message]
2011-09-07  4:23 ` [PATCH v2 2/9] Kernel Memory cgroup Glauber Costa
2011-09-07  5:24   ` Paul Menage
2011-09-07  5:55     ` Glauber Costa
2011-09-07  4:23 ` [PATCH v2 3/9] socket: initial cgroup code Glauber Costa
2011-09-07  5:26   ` Paul Menage
2011-09-07  5:59     ` Glauber Costa
2011-09-07 22:17   ` Kirill A. Shutemov
2011-09-08  4:54     ` Glauber Costa
2011-09-08  5:35       ` Kirill A. Shutemov
2011-09-08 12:41         ` Glauber Costa
2011-09-07  4:23 ` [PATCH v2 4/9] function wrappers for upcoming socket Glauber Costa
2011-09-07  4:23 ` [PATCH v2 5/9] foundations of per-cgroup memory pressure controlling Glauber Costa
2011-09-07  4:23 ` [PATCH v2 6/9] per-cgroup tcp buffers control Glauber Costa
2011-09-07  7:32   ` Li Zefan
2011-09-07 13:02     ` Glauber Costa
2011-09-09  3:12   ` KAMEZAWA Hiroyuki
2011-09-09 12:01     ` Glauber Costa
2011-09-12 10:31       ` KAMEZAWA Hiroyuki
2011-09-07  4:23 ` [PATCH v2 7/9] tcp buffer limitation: per-cgroup limit Glauber Costa
2011-09-07  4:23 ` [PATCH v2 8/9] Display current tcp memory allocation in kmem cgroup Glauber Costa
2011-09-07  4:23 ` [PATCH v2 9/9] Add documentation about kmem_cgroup Glauber Costa
2011-09-08 17:46   ` Randy Dunlap

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4E6993BC.5030004@parallels.com \
    --to=glommer@parallels.com \
    --cc=containers@lists.osdl.org \
    --cc=davem@davemloft.net \
    --cc=ebiederm@xmission.com \
    --cc=kamezawa.hiroyu@jp.fujitsu.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=netdev@vger.kernel.org \
    --cc=xemul@parallels.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox