Welcome to Soft32 Linux Forums!
FAQFAQ    SearchSearch      ProfileProfile    Private MessagesPrivate Messages   Log inLog in

[PATCH 1/3] sysfs directory scaling: rbtree for dirent nam..

 
Goto page 1, 2
   Soft32 Home -> Linux -> Kernel RSS
Next:  [GIT PULL] perf events fixes  
Author Message
Benjamin LaHaise

External


Since: Nov 01, 2009
Posts: 7



(Msg. 1) Posted: Sun Nov 01, 2009 11:20 am
Post subject: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups
Archived from groups: linux>kernel (more info?)

Use an rbtree in sysfs_dirent to speed up file lookup times

Systems with large numbers (tens of thousands and more) of network
interfaces stress the sysfs code in ways that make the linear search for
a name match take far too long. Avoid this by using an rbtree.

Signed-off-by: Benjamin LaHaise <bcrl DeleteThis @kvack.org>
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 5fad489..30c3fc5 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -44,6 +44,7 @@ static void sysfs_link_sibling(struct sysfs_dirent *sd)
{
struct sysfs_dirent *parent_sd = sd->s_parent;
struct sysfs_dirent **pos;
+ struct rb_node **new, *parent;

BUG_ON(sd->s_sibling);

@@ -57,6 +58,27 @@ static void sysfs_link_sibling(struct sysfs_dirent *sd)
}
sd->s_sibling = *pos;
*pos = sd;
+
+ // rb tree insert
+ new = &(parent_sd->s_dir.child_rb_root.rb_node);
+ parent = NULL;
+
+ while (*new) {
+ struct sysfs_dirent *this =
+ container_of(*new, struct sysfs_dirent, s_rb_node);
+ int result = strcmp(sd->s_name, this->s_name);
+
+ parent = *new;
+ if (result < 0)
+ new = &((*new)->rb_left);
+ else if (result > 0)
+ new = &((*new)->rb_right);
+ else
+ BUG();
+ }
+
+ rb_link_node(&sd->s_rb_node, parent, new);
+ rb_insert_color(&sd->s_rb_node, &parent_sd->s_dir.child_rb_root);
}

/**
@@ -81,6 +103,8 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
break;
}
}
+
+ rb_erase(&sd->s_rb_node, &sd->s_parent->s_dir.child_rb_root);
}

/**
@@ -331,6 +355,9 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
sd->s_mode = mode;
sd->s_flags = type;

+ if (type == SYSFS_DIR)
+ sd->s_dir.child_rb_root = RB_ROOT;
+
return sd;

err_out2:
@@ -630,11 +657,20 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
const unsigned char *name)
{
- struct sysfs_dirent *sd;
-
- for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling)
- if (!strcmp(sd->s_name, name))
- return sd;
+ struct rb_node *node = parent_sd->s_dir.child_rb_root.rb_node;
+
+ while (node) {
+ struct sysfs_dirent *data =
+ container_of(node, struct sysfs_dirent, s_rb_node);
+ int result;
+ result = strcmp(name, data->s_name);
+ if (result < 0)
+ node = node->rb_left;
+ else if (result > 0)
+ node = node->rb_right;
+ else
+ return data;
+ }
return NULL;
}

diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index af4c4e7..600109c 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -9,6 +9,7 @@
*/

#include <linux/fs.h>
+#include <linux/rbtree.h>

struct sysfs_open_dirent;

@@ -17,6 +18,7 @@ struct sysfs_elem_dir {
struct kobject *kobj;
/* children list starts here and goes through sd->s_sibling */
struct sysfs_dirent *children;
+ struct rb_root child_rb_root;
};

struct sysfs_elem_symlink {
@@ -52,6 +54,7 @@ struct sysfs_dirent {
atomic_t s_active;
struct sysfs_dirent *s_parent;
struct sysfs_dirent *s_sibling;
+ struct rb_node s_rb_node;
const char *s_name;

union {
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo DeleteThis @vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Back to top
Login to vote
Benjamin LaHaise

External


Since: Nov 01, 2009
Posts: 7



(Msg. 2) Posted: Sun Nov 01, 2009 11:20 am
Post subject: [PATCH 2/3] sysfs directory scaling: doubly linked list for dirents [Login to view extended thread Info.]
Archived from groups: per prev. post (more info?)

When adding/removing large numbers of entries from sysfs directories, one
hot spot is in the link and unlink operations of sysfs. When linking a
new sysfs_dirent into the tree, operation can be significantly sped up by
starting at the end of the list rather than the beginning. Unlink is
improved by using a doubly linked list.

Signed-off-by: Benjamin LaHaise <bcrl.DeleteThis@kvack.org>
diff -u b/fs/sysfs/dir.c b/fs/sysfs/dir.c
--- b/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -43,11 +43,18 @@
static void sysfs_link_sibling(struct sysfs_dirent *sd)
{
struct sysfs_dirent *parent_sd = sd->s_parent;
- struct sysfs_dirent **pos;
+ struct sysfs_dirent **pos, *prev = NULL;
struct rb_node **new, *parent;

BUG_ON(sd->s_sibling);

+ if (parent_sd->s_dir.children_tail &&
+ parent_sd->s_dir.children_tail->s_ino < sd->s_ino) {
+ prev = parent_sd->s_dir.children_tail;
+ pos = &prev->s_sibling;
+ goto got_it;
+ }
+
/* Store directory entries in order by ino. This allows
* readdir to properly restart without having to add a
* cursor into the s_dir.children list.
@@ -55,8 +62,13 @@
for (pos = &parent_sd->s_dir.children; *pos; pos = &(*pos)->s_sibling) {
if (sd->s_ino < (*pos)->s_ino)
break;
+ prev = *pos;
}
+got_it:
+ if (prev == parent_sd->s_dir.children_tail)
+ parent_sd->s_dir.children_tail = sd;
sd->s_sibling = *pos;
+ sd->s_sibling_prev = prev;
*pos = sd;

// rb tree insert
@@ -93,17 +105,20 @@
*/
static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
{
- struct sysfs_dirent **pos;
-
- for (pos = &sd->s_parent->s_dir.children; *pos;
- pos = &(*pos)->s_sibling) {
- if (*pos == sd) {
- *pos = sd->s_sibling;
- sd->s_sibling = NULL;
- break;
- }
- }
+ struct sysfs_dirent **pos, *prev = NULL;

+ prev = sd->s_sibling_prev;
+ if (prev)
+ pos = &prev->s_sibling;
+ else
+ pos = &sd->s_parent->s_dir.children;
+ if (sd == sd->s_parent->s_dir.children_tail)
+ sd->s_parent->s_dir.children_tail = prev;
+ *pos = sd->s_sibling;
+ if (sd->s_sibling)
+ sd->s_sibling->s_sibling_prev = prev;
+
+ sd->s_sibling_prev = NULL;
rb_erase(&sd->s_rb_node, &sd->s_parent->s_dir.child_rb_root);
}

diff -u b/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
--- b/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -17,7 +17,9 @@
struct sysfs_elem_dir {
struct kobject *kobj;
/* children list starts here and goes through sd->s_sibling */
+
struct sysfs_dirent *children;
+ struct sysfs_dirent *children_tail;
struct rb_root child_rb_root;
};

@@ -54,6 +56,7 @@
atomic_t s_active;
struct sysfs_dirent *s_parent;
struct sysfs_dirent *s_sibling;
+ struct sysfs_dirent *s_sibling_prev;
struct rb_node s_rb_node;
const char *s_name;

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo.DeleteThis@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Back to top
Login to vote
Benjamin LaHaise

External


Since: Nov 01, 2009
Posts: 7



(Msg. 3) Posted: Sun Nov 01, 2009 11:20 am
Post subject: [PATCH 2/3] sysfs directory scaling: count number of children dirs [Login to view extended thread Info.]
Archived from groups: per prev. post (more info?)

sysfs_count_nlink() is a bottleneck during mass bring up of network
interfaces. Eliminate this problem by tracking the number of directories.

Signed-off-by: Benjamin LaHaise <bcrl DeleteThis @kvack.org>
diff -u b/fs/sysfs/dir.c b/fs/sysfs/dir.c
--- b/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -70,6 +70,7 @@
sd->s_sibling = *pos;
sd->s_sibling_prev = prev;
*pos = sd;
+ parent_sd->s_nr_children_dir += (sysfs_type(sd) == SYSFS_DIR);

// rb tree insert
new = &(parent_sd->s_dir.child_rb_root.rb_node);
@@ -118,6 +119,7 @@
if (sd->s_sibling)
sd->s_sibling->s_sibling_prev = prev;

+ sd->s_parent->s_nr_children_dir -= (sysfs_type(sd) == SYSFS_DIR);
sd->s_sibling_prev = NULL;
rb_erase(&sd->s_rb_node, &sd->s_parent->s_dir.child_rb_root);
}
diff -u b/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
--- b/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -71,6 +71,8 @@
ino_t s_ino;
umode_t s_mode;
struct sysfs_inode_attrs *s_iattr;
+
+ int s_nr_children_dir;
};

#define SD_DEACTIVATED_BIAS INT_MIN
only in patch2:
unchanged:
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -191,14 +191,7 @@ static struct lock_class_key sysfs_inode_imutex_key;

static int sysfs_count_nlink(struct sysfs_dirent *sd)
{
- struct sysfs_dirent *child;
- int nr = 0;
-
- for (child = sd->s_dir.children; child; child = child->s_sibling)
- if (sysfs_type(child) == SYSFS_DIR)
- nr++;
-
- return nr + 2;
+ return sd->s_nr_children_dir + 2;
}

static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo DeleteThis @vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Back to top
Login to vote
Greg KH

External


Since: Aug 31, 2005
Posts: 723



(Msg. 4) Posted: Mon Nov 02, 2009 11:20 pm
Post subject: Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups [Login to view extended thread Info.]
Archived from groups: per prev. post (more info?)

On Sun, Nov 01, 2009 at 11:31:30AM -0500, Benjamin LaHaise wrote:
> Use an rbtree in sysfs_dirent to speed up file lookup times
>
> Systems with large numbers (tens of thousands and more) of network
> interfaces stress the sysfs code in ways that make the linear search for
> a name match take far too long. Avoid this by using an rbtree.

What kind of speedups are you seeing here? And do these changes cause a
memory increase due to the structure changes which outweigh the
speedups?

What kind of test are you doing to reproduce this?

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo RemoveThis @vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Back to top
Login to vote
Eric Dumazet

External


Since: Jul 03, 2009
Posts: 39



(Msg. 5) Posted: Tue Nov 03, 2009 4:55 am
Post subject: Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups [Login to view extended thread Info.]
Archived from groups: per prev. post (more info?)

Greg KH a écrit :
> On Sun, Nov 01, 2009 at 11:31:30AM -0500, Benjamin LaHaise wrote:
>> Use an rbtree in sysfs_dirent to speed up file lookup times
>>
>> Systems with large numbers (tens of thousands and more) of network
>> interfaces stress the sysfs code in ways that make the linear search for
>> a name match take far too long. Avoid this by using an rbtree.
>
> What kind of speedups are you seeing here? And do these changes cause a
> memory increase due to the structure changes which outweigh the
> speedups?
>
> What kind of test are you doing to reproduce this?
>

Its curious because in my tests the biggest problems come from
kernel/sysctl.c (__register_sysctl_paths) consuming 80% of cpu
in following attempt to create 20.000 devices

(disable hotplug before trying this, and ipv6 too !)
modprobe dummy numdummies=20000

I believe we should address __register_sysctl_paths() scalability
problems too.

I dont know what is the 'sentinel' we allocate after each struct ctl_table
But I suspect we could reduce size requirement of the 'sentinel' to include
only needed fields for the sentinel (and move them at start of ctl_table)

/*
* For each path component, allocate a 2-element ctl_table array.
* The first array element will be filled with the sysctl entry
* for this, the second will be the sentinel (ctl_name == 0).
*
* We allocate everything in one go so that we don't have to
* worry about freeing additional memory in unregister_sysctl_table.
*/
header = kzalloc(sizeof(struct ctl_table_header) +
(2 * npath * sizeof(struct ctl_table)), GFP_KERNEL);

Then, adding an rb_node in ctl_table_header to speedup __register_sysctl_paths() a bit
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo DeleteThis @vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Back to top
Login to vote
Eric Dumazet

External


Since: Jul 03, 2009
Posts: 39



(Msg. 6) Posted: Tue Nov 03, 2009 4:55 am
Post subject: [PATCH] sysctl: reduce ram usage by 40 % [Login to view extended thread Info.]
Archived from groups: per prev. post (more info?)

Eric Dumazet a écrit :

> Its curious because in my tests the biggest problems come from
> kernel/sysctl.c (__register_sysctl_paths) consuming 80% of cpu
> in following attempt to create 20.000 devices
>
> (disable hotplug before trying this, and ipv6 too !)
> modprobe dummy numdummies=20000
>
> I believe we should address __register_sysctl_paths() scalability
> problems too.
>
> I dont know what is the 'sentinel' we allocate after each struct ctl_table
> But I suspect we could reduce size requirement of the 'sentinel' to include
> only needed fields for the sentinel (and move them at start of ctl_table)
>

Here is the patch to reduce ram usage of sysctl :

[PATCH] sysctl: reduce ram usage by 40 %

We currently reserve space for a so called sentinel, a full struct ctl_table
for each ctl_table. We can cheat a bit since only needed fields of a sentinel
are ctl_name and procname. Add a new structure (struct ctl_table_sentinel)
that includes a full ctl_table and only required part of a sentinel.

Signed-off-by: Eric Dumazet <eric.dumazet.TakeThisOut@gmail.com>
---
include/linux/sysctl.h | 13 ++++++++++++-
kernel/sysctl.c | 19 ++++++++++---------
2 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 1e4743e..6a1b1d5 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -1050,8 +1050,10 @@ extern ctl_handler sysctl_ms_jiffies;
/* A sysctl table is an array of struct ctl_table: */
struct ctl_table
{
- int ctl_name; /* Binary ID */
+ /* ctl_name and procname must be first fields (check sentinel) */
+ int ctl_name; /* Binary ID */
const char *procname; /* Text ID for /proc/sys, or zero */
+
void *data;
int maxlen;
mode_t mode;
@@ -1063,6 +1065,15 @@ struct ctl_table
void *extra2;
};

+/* ctl_table_sentinel : a ctl_table followed by a sentinel
+ * (null ctl & procname)
+ */
+struct ctl_table_sentinel {
+ struct ctl_table table;
+ int ctl_name;
+ const char *procname;
+};
+
struct ctl_table_root {
struct list_head root_list;
struct ctl_table_set default_set;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0d949c5..5d29dd8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2063,7 +2063,8 @@ struct ctl_table_header *__register_sysctl_paths(
const struct ctl_path *path, struct ctl_table *table)
{
struct ctl_table_header *header;
- struct ctl_table *new, **prevp;
+ struct ctl_table_sentinel *new;
+ struct ctl_table **prevp;
unsigned int n, npath;
struct ctl_table_set *set;

@@ -2080,24 +2081,24 @@ struct ctl_table_header *__register_sysctl_paths(
* worry about freeing additional memory in unregister_sysctl_table.
*/
header = kzalloc(sizeof(struct ctl_table_header) +
- (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL);
+ (npath * sizeof(struct ctl_table_sentinel)), GFP_KERNEL);
if (!header)
return NULL;

- new = (struct ctl_table *) (header + 1);
+ new = (struct ctl_table_sentinel *) (header + 1);

/* Now connect the dots */
prevp = &header->ctl_table;
for (n = 0; n < npath; ++n, ++path) {
/* Copy the procname */
- new->procname = path->procname;
- new->ctl_name = path->ctl_name;
- new->mode = 0555;
+ new->table.procname = path->procname;
+ new->table.ctl_name = path->ctl_name;
+ new->table.mode = 0555;

- *prevp = new;
- prevp = &new->child;
+ *prevp = &new->table;
+ prevp = &new->table.child;

- new += 2;
+ new++;
}
*prevp = table;
header->ctl_table_arg = table;
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo.TakeThisOut@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Back to top
Login to vote
Eric W. Biederman

External


Since: Nov 04, 2006
Posts: 795



(Msg. 7) Posted: Tue Nov 03, 2009 4:56 am
Post subject: Re: [PATCH] sysctl: reduce ram usage by 40 % [Login to view extended thread Info.]
Archived from groups: per prev. post (more info?)

Eric Dumazet <eric.dumazet DeleteThis @gmail.com> writes:

> Eric Dumazet a écrit :
>
>> Its curious because in my tests the biggest problems come from
>> kernel/sysctl.c (__register_sysctl_paths) consuming 80% of cpu
>> in following attempt to create 20.000 devices

I bet that is Al's cute glue all the sysctl data structures together
patch. It improves readdir and lookup at a small cost at registration
time.

>> (disable hotplug before trying this, and ipv6 too !)
>> modprobe dummy numdummies=20000


>> I believe we should address __register_sysctl_paths() scalability
>> problems too.

Agreed.

>> I dont know what is the 'sentinel' we allocate after each struct ctl_table
>> But I suspect we could reduce size requirement of the 'sentinel' to include
>> only needed fields for the sentinel (and move them at start of ctl_table)

The sentinel is just a NULL terminator.

> Here is the patch to reduce ram usage of sysctl :
>
> [PATCH] sysctl: reduce ram usage by 40 %
>
> We currently reserve space for a so called sentinel, a full struct ctl_table
> for each ctl_table. We can cheat a bit since only needed fields of a sentinel
> are ctl_name and procname. Add a new structure (struct ctl_table_sentinel)
> that includes a full ctl_table and only required part of a sentinel.

Before we address sysctl I would like to get out my patchset that
makes sys_sysctl a wrapper around the ascii version of
/proc/sys/net. Once that goes in it becomes much easier to do things
and perform radical surgery on sysctl. Little things like .ctl_name and
..strategy go away.

Have you happened to look at the other cost of /proc proper? Hmm.
Except for /proc/net/dev_snmp6 it doesn't look like we keep per
interface directories in proc so without ivp6 you won't see the proc
generic code at all.

The practical consequence is if /proc/net/dev_snmp6 is not painful during
registration right now we can probably convert all of /proc/sys/net to proc
generic after my other changes are in.

Eric
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo DeleteThis @vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Back to top
Login to vote
Eric W. Biederman

External


Since: Nov 04, 2006
Posts: 795



(Msg. 8) Posted: Tue Nov 03, 2009 4:56 am
Post subject: Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups [Login to view extended thread Info.]
Archived from groups: per prev. post (more info?)

Benjamin LaHaise <bcrl RemoveThis @lhnet.ca> writes:

> Use an rbtree in sysfs_dirent to speed up file lookup times
>
> Systems with large numbers (tens of thousands and more) of network
> interfaces stress the sysfs code in ways that make the linear search for
> a name match take far too long. Avoid this by using an rbtree.

Please take a look at the cleanups_scaling branch at:
kernel.org:/pub/scm/linux/kernel/git/ebiederm/linux-2.6.32-rc5-sysfs-enhancements

I haven't spent a lot of time on it but it is possible to get everything
except the rbtree without increasing the size of sysfs_dirent. Also we
don't need the both the rbtree and a linked list.

In particular see:
commit 50623bbb82da3bd1d596b9173a91ed1b5aa168b8
Author: Eric W. Biederman <ebiederm RemoveThis @maxwell.aristanetworks.com>
Date: Sat Oct 31 04:11:18 2009 -0700

sysfs: Sort sysfs directories by name hash.

This is a step in preparation for introducing a more efficient
data structure than a linked list for sysfs entries. By ordering
by name hash instead of by inode sysfs_lookup can be speeded
up as well as allowing restarting after seekdir.

Signed-off-by: Eric W. Biederman <ebiederm RemoveThis @aristanetworks.com>

Meanwhile back to pushing the most important ones for real.

Eric
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo RemoveThis @vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Back to top
Login to vote
Greg KH

External


Since: Aug 31, 2005
Posts: 723



(Msg. 9) Posted: Tue Nov 03, 2009 11:20 am
Post subject: Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups [Login to view extended thread Info.]
Archived from groups: per prev. post (more info?)

On Tue, Nov 03, 2009 at 07:14:33AM +0100, Eric Dumazet wrote:
> Greg KH a ?crit :
> > On Sun, Nov 01, 2009 at 11:31:30AM -0500, Benjamin LaHaise wrote:
> >> Use an rbtree in sysfs_dirent to speed up file lookup times
> >>
> >> Systems with large numbers (tens of thousands and more) of network
> >> interfaces stress the sysfs code in ways that make the linear search for
> >> a name match take far too long. Avoid this by using an rbtree.
> >
> > What kind of speedups are you seeing here? And do these changes cause a
> > memory increase due to the structure changes which outweigh the
> > speedups?
> >
> > What kind of test are you doing to reproduce this?
> >
>
> Its curious because in my tests the biggest problems come from
> kernel/sysctl.c (__register_sysctl_paths) consuming 80% of cpu
> in following attempt to create 20.000 devices
>
> (disable hotplug before trying this, and ipv6 too !)
> modprobe dummy numdummies=20000
>
> I believe we should address __register_sysctl_paths() scalability
> problems too.

But registering 20000 devices is a far different problem from using
those 20000 devices Smile

I think the "use the device" path should be the one we care the most
about fixing up, as that is much more common than the register path for
all users.

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo.RemoveThis@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Back to top
Login to vote
Octavian Purdila

External


Since: Nov 03, 2009
Posts: 1



(Msg. 10) Posted: Tue Nov 03, 2009 11:20 am
Post subject: Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups [Login to view extended thread Info.]
Archived from groups: per prev. post (more info?)

On Tuesday 03 November 2009 18:07:15 you wrote:

> > > What kind of test are you doing to reproduce this?
> >
> > Its curious because in my tests the biggest problems come from
> > kernel/sysctl.c (__register_sysctl_paths) consuming 80% of cpu
> > in following attempt to create 20.000 devices
> >
> > (disable hotplug before trying this, and ipv6 too !)
> > modprobe dummy numdummies=20000
> >
> > I believe we should address __register_sysctl_paths() scalability
> > problems too.
>
> But registering 20000 devices is a far different problem from using
> those 20000 devices Smile
>
> I think the "use the device" path should be the one we care the most
> about fixing up, as that is much more common than the register path for
> all users.
>

For sysctl in general probably, but I would argue that for dynamic network
interfaces (ppp and other sorts of tunnels) the "use" and "register" paths are
not that unbalanced.

For our case where we use up to 128K interfaces, sysctl entries per network
interface is pretty much unusable - but I agree that is not a very common case
Smile

However [1] is not so far fetched.

[1] http://www.spinics.net/lists/netdev/msg110392.html

Thanks,
tavi
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo.RemoveThis@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Back to top
Login to vote
Benjamin LaHaise

External


Since: Nov 01, 2009
Posts: 7



(Msg. 11) Posted: Tue Nov 03, 2009 11:20 am
Post subject: Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups [Login to view extended thread Info.]
Archived from groups: per prev. post (more info?)

On Tue, Nov 03, 2009 at 08:07:15AM -0800, Greg KH wrote:
> But registering 20000 devices is a far different problem from using
> those 20000 devices Smile

Registering 20,000 devices *is* a real world problem (I'm actually aiming
for 100,000, as that's what roughly fits in a single 10Gbps link -- something
that a mid range system can now route). When an edge router comes up from
reboot, or after a link has been down, the rate at which customers connect
is important -- too slow, and you get a pile of support calls from customers
complaining that their connection is down. Because of the data structures
used, there isn't even any improvement from an SMP system, so this needs
to be addressed directly.

-ben
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo.DeleteThis@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Back to top
Login to vote
Greg KH

External


Since: Aug 31, 2005
Posts: 723



(Msg. 12) Posted: Tue Nov 03, 2009 3:20 pm
Post subject: Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups [Login to view extended thread Info.]
Archived from groups: per prev. post (more info?)

On Tue, Nov 03, 2009 at 11:45:50AM -0500, Benjamin LaHaise wrote:
> On Tue, Nov 03, 2009 at 08:07:15AM -0800, Greg KH wrote:
> > But registering 20000 devices is a far different problem from using
> > those 20000 devices Smile
>
> Registering 20,000 devices *is* a real world problem (I'm actually aiming
> for 100,000, as that's what roughly fits in a single 10Gbps link -- something
> that a mid range system can now route). When an edge router comes up from
> reboot, or after a link has been down, the rate at which customers connect
> is important -- too slow, and you get a pile of support calls from customers
> complaining that their connection is down. Because of the data structures
> used, there isn't even any improvement from an SMP system, so this needs
> to be addressed directly.

Ok, how long are we talking about here?

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo.RemoveThis@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Back to top
Login to vote
Benjamin LaHaise

External


Since: Nov 01, 2009
Posts: 7



(Msg. 13) Posted: Tue Nov 03, 2009 3:20 pm
Post subject: Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups [Login to view extended thread Info.]
Archived from groups: per prev. post (more info?)

On Mon, Nov 02, 2009 at 07:50:58PM -0800, Greg KH wrote:
> On Sun, Nov 01, 2009 at 11:31:30AM -0500, Benjamin LaHaise wrote:
> > Use an rbtree in sysfs_dirent to speed up file lookup times
> >
> > Systems with large numbers (tens of thousands and more) of network
> > interfaces stress the sysfs code in ways that make the linear search for
> > a name match take far too long. Avoid this by using an rbtree.
>
> What kind of speedups are you seeing here? And do these changes cause a
> memory increase due to the structure changes which outweigh the
> speedups?

Depends on the number of interfaces being created. Without the patch,
interface creation time tends to double or worse for every 5,000-10,000
additional network interfaces.

> What kind of test are you doing to reproduce this?

I'm creating 30,000+ network interfaces, with the goal being 100,000.
With other hacks in the tree to get around the sysctl and procfs scaling
issues, as well as disabling things like NetworkManager, the results look
as follows:

Interfaces no-rb rbtree rbtree+list
0-5,000 13.8s 14.0s 13.0s
5,000-10,000 20.0s 17.4s 14.4s
10,000-15,000 27.3s 24.1s 16.9s
15,000-20,000 36.3s 32.2s 19.7s
20,000-25,000 45.2s 40.0s 22.9s
25,000-30,000 54.2s 48.2s 26.6s
30,000-35,000 63.9s 54.9s 30.7s

Thoughts?

-ben
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo.TakeThisOut@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Back to top
Login to vote
Eric W. Biederman

External


Since: Nov 04, 2006
Posts: 795



(Msg. 14) Posted: Tue Nov 03, 2009 5:20 pm
Post subject: Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups [Login to view extended thread Info.]
Archived from groups: per prev. post (more info?)

Benjamin LaHaise <bcrl.DeleteThis@lhnet.ca> writes:

> On Mon, Nov 02, 2009 at 07:50:58PM -0800, Greg KH wrote:
>> On Sun, Nov 01, 2009 at 11:31:30AM -0500, Benjamin LaHaise wrote:
>> > Use an rbtree in sysfs_dirent to speed up file lookup times
>> >
>> > Systems with large numbers (tens of thousands and more) of network
>> > interfaces stress the sysfs code in ways that make the linear search for
>> > a name match take far too long. Avoid this by using an rbtree.
>>
>> What kind of speedups are you seeing here? And do these changes cause a
>> memory increase due to the structure changes which outweigh the
>> speedups?
>
> Depends on the number of interfaces being created. Without the patch,
> interface creation time tends to double or worse for every 5,000-10,000
> additional network interfaces.
>
>> What kind of test are you doing to reproduce this?
>
> I'm creating 30,000+ network interfaces, with the goal being 100,000.
> With other hacks in the tree to get around the sysctl and procfs scaling
> issues, as well as disabling things like NetworkManager, the results look
> as follows:
>
> Interfaces no-rb rbtree rbtree+list
> 0-5,000 13.8s 14.0s 13.0s
> 5,000-10,000 20.0s 17.4s 14.4s
> 10,000-15,000 27.3s 24.1s 16.9s
> 15,000-20,000 36.3s 32.2s 19.7s
> 20,000-25,000 45.2s 40.0s 22.9s
> 25,000-30,000 54.2s 48.2s 26.6s
> 30,000-35,000 63.9s 54.9s 30.7s
>
> Thoughts?

Something is very weird. I just took your no-rb numbers
and divided by the number of interfaces to see how well we are
scaling. I got:

Interfaces per-interface cost
5,000 0.002760s
10,000 0.002000s
15,000 0.001820s
20,000 0.001815s
25,000 0.001808s
30,000 0.001807s
35,000 0.001826s

I ran a variant of this test a long time ago and at that the
cost per interface grew with additional interfaces added. This
jives with the fact that the fundamental cost of adding N
network interfaces to sysfs is O(N^2).

Are your numbers from your application and are they real world?
In which case they are interesting, but it would be good if
we could also have microbenchmark numbers that just measure
the sysfs costs. If nothing else I am seeing a big startup
overhead that isn't being subtracted out that makes it hard
to see the real costs here.

Eric
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo.DeleteThis@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Back to top
Login to vote
Eric W. Biederman

External


Since: Nov 04, 2006
Posts: 795



(Msg. 15) Posted: Tue Nov 03, 2009 5:20 pm
Post subject: Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups [Login to view extended thread Info.]
Archived from groups: per prev. post (more info?)

ebiederm.DeleteThis@xmission.com (Eric W. Biederman) writes:

> Benjamin LaHaise <bcrl.DeleteThis@lhnet.ca> writes:
>
>> On Mon, Nov 02, 2009 at 07:50:58PM -0800, Greg KH wrote:
>>> On Sun, Nov 01, 2009 at 11:31:30AM -0500, Benjamin LaHaise wrote:
>>> > Use an rbtree in sysfs_dirent to speed up file lookup times
>>> >
>>> > Systems with large numbers (tens of thousands and more) of network
>>> > interfaces stress the sysfs code in ways that make the linear search for
>>> > a name match take far too long. Avoid this by using an rbtree.
>>>
>>> What kind of speedups are you seeing here? And do these changes cause a
>>> memory increase due to the structure changes which outweigh the
>>> speedups?
>>
>> Depends on the number of interfaces being created. Without the patch,
>> interface creation time tends to double or worse for every 5,000-10,000
>> additional network interfaces.
>>
>>> What kind of test are you doing to reproduce this?
>>
>> I'm creating 30,000+ network interfaces, with the goal being 100,000.
>> With other hacks in the tree to get around the sysctl and procfs scaling
>> issues, as well as disabling things like NetworkManager, the results look
>> as follows:
>>
>> Interfaces no-rb rbtree rbtree+list
>> 0-5,000 13.8s 14.0s 13.0s
>> 5,000-10,000 20.0s 17.4s 14.4s
>> 10,000-15,000 27.3s 24.1s 16.9s
>> 15,000-20,000 36.3s 32.2s 19.7s
>> 20,000-25,000 45.2s 40.0s 22.9s
>> 25,000-30,000 54.2s 48.2s 26.6s
>> 30,000-35,000 63.9s 54.9s 30.7s
>>
>> Thoughts?
>
> Something is very weird. I just took your no-rb numbers
> and divided by the number of interfaces to see how well we are
> scaling. I got:
>
> Interfaces per-interface cost
> 5,000 0.002760s
> 10,000 0.002000s
> 15,000 0.001820s
> 20,000 0.001815s
> 25,000 0.001808s
> 30,000 0.001807s
> 35,000 0.001826s
>
> I ran a variant of this test a long time ago and at that the
> cost per interface grew with additional interfaces added. This
> jives with the fact that the fundamental cost of adding N
> network interfaces to sysfs is O(N^2).
>
> Are your numbers from your application and are they real world?
> In which case they are interesting, but it would be good if
> we could also have microbenchmark numbers that just measure
> the sysfs costs. If nothing else I am seeing a big startup
> overhead that isn't being subtracted out that makes it hard
> to see the real costs here.

I guess in particular what I would expect is that if we can do 35000
interfaces in 63s with an O(N^2) algorithm. Then we should be able to
do 35000 interfaces with an O(NlogN) algorithm in under a second.
Which for your application should make the time essentially flat in
the number of interfaces.

Until we get close to that I figure we need to do more digging.

Eric
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo.DeleteThis@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Back to top
Login to vote
Display posts from previous:   
Related Topics:
[PATCH 1/2] sysfs: Shadow directory support - The problem. When implementing a network namespace I need to be able to have multiple network devices with the same..

[PATCH 0/5] On to usable sysfs shadow directory support... - The following patchset has been tested on 2.6.21-rc6 + Kay's..

[PATCH] Documentation/rbtree.txt - Signed-off-by: Rob Landley <rob@landley.net> Documentation for lib/rbtree.c. -- I'm not an expert on this but ...

[RFC][PATCH -mm] swsusp: Use rbtree for tracking allocated.. - Hi, Some time ago we discussed the possibility of simplifying the swsusp's approach towards tracking the swap pages..

[PATCHSET 2.6.22-rc4-mm2] sysfs: make directory dentries/i.. - This patchset makes directory dentries and inodes reclaimable and is consisted of the following eleven patches. #01:..

[PATCHSET 2.6.22-rc2-mm1 REVIEW] sysfs: make directory den.. - Hello, again. THIS PATCHSET NEEDS MORE REVIEW AND TESTING. PLEASE DO NOT APPLY YET. This patchset makes directory..
       Soft32 Home -> Linux -> Kernel All times are: Pacific Time (US & Canada) (change)
Goto page 1, 2
Page 1 of 2

 
You can post new topics in this forum
You can reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum

Categories:
 Windows
  Linux
 Mac
 PDA


[ Contact us | Terms of Service/Privacy Policy ]