1
0
mirror of git://git.sv.gnu.org/coreutils.git synced 2026-04-12 06:57:33 +02:00

rm -r: avoid O(n^2) performance for a directory with very many entries

This enhancement works around a problem that is specific to at least
ext3 and ext4 file systems.  With them, it would take hours to remove
a two-million-entry directory.  RAM-backed file systems (tmpfs) are
not affected, since there is no seek penalty.
* remove.c (rm_malloc, rm_free, compare_ino): New functions.
(dirent_count, preprocess_dir): New functions.
[struct readdir_data]: New struct.
(remove_cwd_entries): Call preprocess_dir.
* tests/rm/ext3-perf: New file.  Test for the performance fix.
* NEWS: Mention the new feature.
This commit is contained in:
Jim Meyering
2008-09-22 22:42:12 +02:00
parent a5111af33e
commit 24412edeaf
4 changed files with 249 additions and 0 deletions

7
NEWS
View File

@@ -9,6 +9,13 @@ GNU coreutils NEWS -*- outline -*-
** New features
chgrp, chmod, chown, chcon, du, rm: now all display linear performance,
even when operating on million-entry directories on ext3 and ext4 file
systems. Before, they would exhibit O(N^2) performance, due to linear
per-entry seek time cost when operating on entries in readdir order.
Rm was improved directly, while the others inherit the improvement
from the newer version of fts in gnulib.
comm now verifies that the inputs are in sorted order. This check can
be turned off with the --nocheck-order option.

View File

@@ -60,6 +60,14 @@ enum
CONSECUTIVE_READDIR_UNLINK_THRESHOLD = 10
};
/* If the heuristics in preprocess_dir suggest that there
   are fewer than this many entries in a directory, then it
   skips the preprocessing altogether, since with so few
   entries there is no significant benefit to sorting them
   by inode before removal.  */
enum
{
  INODE_SORT_DIR_ENTRIES_THRESHOLD = 10000
};
/* FIXME: in 2009, or whenever Darwin 7.9.0 (aka MacOS X 10.3.9) is no
longer relevant, remove this work-around code. Then, there will be
no need to perform the extra rewinddir call, ever. */
@@ -217,6 +225,24 @@ hash_compare_strings (void const *x, void const *y)
return STREQ (x, y) ? true : false;
}
/* Obstack allocator: allocate SIZE bytes, recovering from failure
   via longjmp rather than by returning NULL.  V_JUMPBUF is really
   a jmp_buf *, prepared by the caller with setjmp.  */
static void *
rm_malloc (void *v_jumpbuf, long size)
{
  jmp_buf *failure_env = v_jumpbuf;
  void *mem = malloc (size);
  if (mem == NULL)
    longjmp (*failure_env, 1);
  return mem;
}
/* With the 2-arg allocator, we must also provide a two-argument freer:
   the first argument (the jmp_buf pointer given to rm_malloc) is
   unused here; simply free PTR.  */
static void
rm_free (void *v_jumpbuf ATTRIBUTE_UNUSED, void *ptr)
{
  free (ptr);
}
static inline void
push_dir (Dirstack_state *ds, const char *dir_name)
{
@@ -1225,6 +1251,141 @@ fd_to_subdirp (int fd_cwd, char const *f,
return NULL;
}
/* What rm records for each directory entry it preprocesses:
   the inode number and the entry name.  NAME is a flexible array
   member, so each instance is allocated just large enough to hold
   its NUL-terminated name (see preprocess_dir).  */
struct readdir_data
{
  ino_t ino;
  char name[FLEXIBLE_ARRAY_MEMBER];
};
/* qsort comparison function: order two pointers-to-readdir_data
   by increasing inode number.  AV and BV each point to a
   struct readdir_data *.  */
static int
compare_ino (void const *av, void const *bv)
{
  struct readdir_data const *a = *(struct readdir_data const *const *) av;
  struct readdir_data const *b = *(struct readdir_data const *const *) bv;
  if (a->ino < b->ino)
    return -1;
  if (b->ino < a->ino)
    return 1;
  return 0;
}
/* Return an approximation of the maximum number of dirent entries
   in a directory with stat data *ST, by dividing the directory's
   byte size by a lower bound on the space one entry consumes.  */
static size_t
dirent_count (struct stat const *st)
{
  /* Assume each directory entry occupies at least this many bytes.  */
  enum { MIN_DIR_ENTRY_SIZE = 16 };
  return st->st_size / MIN_DIR_ENTRY_SIZE;
}
/* When a directory contains very many entries, operating on N entries in
   readdir order can be very seek-intensive (be it to unlink or even to
   merely stat each entry), to the point that it results in O(N^2) work.
   This is file-system-specific: ext3 and ext4 (as of 2008) are susceptible,
   but tmpfs is not.  The general solution is to process entries in inode
   order.  That means reading all entries, and sorting them before operating
   on any.  As such, it is useful only on systems with useful dirent.d_ino.
   Since 'rm -r's removal process must traverse into directories and since
   this preprocessing phase can allocate O(N) storage, here we store and
   sort only non-directory entries, and then remove all of those, so that we
   can free all allocated storage before traversing into any subdirectory.
   Perform this optimization only when not interactive and not in verbose
   mode, to keep the implementation simple and to minimize duplication.
   Upon failure, simply free any resources and return; the caller then
   processes whatever entries remain via the usual (unsorted) path after
   the rewinddir below.  */
static void
preprocess_dir (DIR **dirp, struct rm_options const *x)
{
#if HAVE_STRUCT_DIRENT_D_TYPE
  struct stat st;

  /* There are many reasons to return right away, skipping this
     optimization.  The penalty for being wrong is that we will
     perform a small amount of extra work.
     Skip this optimization if... */
  if (/* - there is a chance of interactivity */
      x->interactive != RMI_NEVER

      /* - we're in verbose mode */
      || x->verbose

      /* - privileged users can unlink nonempty directories.
         Otherwise, there'd be a race condition between the readdir
         call (in which we learn dirent.d_type) and the unlink, by
         which time the non-directory may be replaced with a directory. */
      || ! cannot_unlink_dir ()

      /* - we can't fstat the file descriptor */
      || fstat (dirfd (*dirp), &st) != 0

      /* - the directory is smaller than some threshold.
         Estimate the number of inodes with a heuristic.
         There's no significant benefit to sorting if there are
         too few inodes.  */
      || dirent_count (&st) < INODE_SORT_DIR_ENTRIES_THRESHOLD)
    return;

  /* FIXME: maybe test file system type, too; skip if it's tmpfs: see fts.c */

  struct obstack o_readdir_data; /* readdir data: inode,name pairs */
  struct obstack o_p;            /* an array of pointers to each inode,name pair */

  /* Arrange to longjmp upon obstack allocation failure.
     NOTE(review): the obstack structs are modified after setjmp and read
     after a longjmp without being volatile-qualified; in practice the
     cleanup path only passes their addresses to obstack_free, but confirm
     this is safe with the obstack implementation in use.  */
  jmp_buf readdir_jumpbuf;
  if (setjmp (readdir_jumpbuf))
    goto cleanup;

  obstack_init_minimal (&o_readdir_data);
  obstack_init_minimal (&o_p);

  obstack_specify_allocation_with_arg (&o_readdir_data, 0, 0,
                                       rm_malloc, rm_free, &readdir_jumpbuf);
  obstack_specify_allocation_with_arg (&o_p, 0, 0,
                                       rm_malloc, rm_free, &readdir_jumpbuf);

  /* Read all entries, storing <d_ino, d_name> for each non-dir one.
     Maintain a parallel list of pointers into the primary buffer.  */
  while (1)
    {
      struct dirent const *dp;
      dp = readdir_ignoring_dot_and_dotdot (*dirp);
      /* no need to distinguish EOF from failure */
      if (dp == NULL)
        break;

      /* Skip known-directory and type-unknown entries.
         NOTE(review): "break" abandons collection at the first such entry
         rather than merely skipping it ("continue").  Any uncollected
         entries are still removed by the caller after the rewinddir
         below, so this is at worst a missed optimization -- confirm
         that aborting here (vs. skipping) is the intended behavior.  */
      if (D_TYPE (dp) == DT_UNKNOWN || D_TYPE (dp) == DT_DIR)
        break;

      size_t name_len = strlen (dp->d_name);
      /* Allocate the struct plus room for the NUL-terminated name
         (flexible array member).  */
      size_t ent_len = offsetof (struct readdir_data, name) + name_len + 1;
      struct readdir_data *v = obstack_alloc (&o_readdir_data, ent_len);
      v->ino = D_INO (dp);
      memcpy (v->name, dp->d_name, name_len + 1);

      /* Append V to the list of pointers.  */
      obstack_ptr_grow (&o_p, v);
    }

  /* Compute size and finalize VV.  */
  size_t n = obstack_object_size (&o_p) / sizeof (void *);
  struct readdir_data **vv = obstack_finish (&o_p);

  /* Sort on inode number.  */
  qsort(vv, n, sizeof *vv, compare_ino);

  /* Iterate through those pointers, unlinking each name.  */
  for (size_t i = 0; i < n; i++)
    {
      /* ignore failure */
      (void) unlinkat (dirfd (*dirp), vv[i]->name, 0);
    }

 cleanup:
  obstack_free (&o_readdir_data, NULL);
  obstack_free (&o_p, NULL);
  /* Let the caller re-read the directory from the start; it will see
     (and handle) any entries this pass did not remove.  */
  rewinddir (*dirp);
#endif
}
/* Remove entries in the directory open on DIRP
Upon finding a directory that is both non-empty and that can be chdir'd
into, return RM_OK and set *SUBDIR and fill in SUBDIR_SB, where
@@ -1247,6 +1408,10 @@ remove_cwd_entries (DIR **dirp,
assert (VALID_STATUS (status));
*subdir = NULL;
/* This is just an optimization.
It's not a fatal problem if it fails. */
preprocess_dir (dirp, x);
while (1)
{
struct dirent const *dp;

View File

@@ -72,6 +72,7 @@ EXTRA_DIST += $(TESTS)
TESTS = \
misc/help-version \
misc/invalid-opt \
rm/ext3-perf \
rm/cycle \
chmod/no-x \
chgrp/basic \

76
tests/rm/ext3-perf Executable file
View File

@@ -0,0 +1,76 @@
#!/bin/sh
# ensure that "rm -rf DIR-with-many-entries" is not O(N^2)

# Copyright (C) 2008 Free Software Foundation, Inc.

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

if test "$VERBOSE" = yes; then
  set -x
  rm --version
fi
. $srcdir/test-lib.sh

# This test creates and removes 400k files, so run it only when
# expensive tests have been enabled.
expensive_

# Fail if removing the directory takes longer than this many seconds.
# Using rm -rf to remove a 400k-entry directory takes:
# - 9 seconds with the patch, on a 2-yr-old system
# - 350 seconds without the patch, on a high-end system (disk 20-30% faster)
threshold_seconds=60

# The number of entries in our test directory.
n=400000

# Choose a value that is large enough to ensure an accidentally
# regressed rm would require much longer than $threshold_seconds to remove
# the directory.  With n=400k, pre-patch GNU rm would require about 350
# seconds even on a fast disk.  On a relatively modern system, the
# patched version of rm requires about 10 seconds, so even if you
# choose to enable very expensive tests with a disk that is much slower,
# the test should still succeed.

# Skip unless "." is on an ext[34] file system; the O(N^2) behavior
# guarded against here is specific to those file systems.
# FIXME-maybe: try to find a suitable file system or allow
# the user to specify it via an envvar.
df -t ext3 -t ext4dev -t ext4 . \
  || skip_test_ 'this test runs only on an ext3 or ext4 file system'

# Skip if there are too few inodes free.  Require some slack.
# (stat -f --format=%d prints the number of free inodes.)
free_inodes=$(stat -f --format=%d .) || framework_failure
min_free_inodes=$(expr 12 \* $n / 10)
test $min_free_inodes -lt $free_inodes \
  || skip_test_ "too few free inodes on '.': $free_inodes;" \
      "this test requires at least $min_free_inodes"

# Create a directory with $n empty files, then spot-check that the
# first and last were actually created.
ok=0
mkdir d &&
  cd d &&
    seq $n | xargs touch &&
    test -f 1 &&
    test -f $n &&
  cd .. &&
  ok=1
test $ok = 1 || framework_failure

fail=0

# Time the removal; fail if it exceeds the threshold.
start=$(date +%s)
rm -rf d || fail=1
duration=$(expr $(date +%s) - $start)
test $duration -lt $threshold_seconds ||
  { fail=1; echo rm took longer than $threshold_seconds seconds; }
echo removing a $n-entry directory took $duration seconds

Exit $fail