Re: Interactions with mxge, pf, nfsd, and the kernel
I've been running one of these machines without pf, and it has ceased
responding on all interfaces (mxge and bce). The console still works
fine, and a reboot will clear the problems for now. I'm running out of
ideas.
root@helo:~ # netstat -i
Name Mtu Network Address Ipkts Ierrs Idrop
Opkts Oerrs Coll
mxge0 9000 <Link#1> 00:60:dd:44:d2:07 44838061 164399 0
31944144 0 0
mxge0 9000 fe80::260:ddf fe80::260:ddff:fe 0 - -
3 - -
bce0 1500 <Link#2> 08:9e:01:50:a3:08 97018 0 0
0 0 0
bce0 1500 fe80::a9e:1ff fe80::a9e:1ff:fe5 0 - -
3 - -
bce1 1500 <Link#3> 08:9e:01:50:a3:09 889442915 1791 0
557044449 0 0
bce1 1500 128.113.12.0 helo 888129846 - -
676300451 - -
bce1 1500 fe80::a9e:1ff fe80::a9e:1ff:fe5 0 - -
4 - -
lo0 16384 <Link#4> 28448 0 0
28448 0 0
lo0 16384 localhost ::1 59 - -
59 - -
lo0 16384 fe80::1%lo0 fe80::1 0 - -
0 - -
lo0 16384 your-net localhost 28389 - -
28389 - -
vlan2 9000 <Link#5> 00:60:dd:44:d2:07 28107520 0 0
19859118 0 0
vlan2 9000 10.2.3.0 helo.galactica.lo 28088754 - -
24433917 - -
vlan2 9000 fe80::260:ddf fe80::260:ddff:fe 0 - -
3 - -
vlan2 9000 <Link#6> 00:60:dd:44:d2:07 16730541 0 0
12084894 0 0
vlan2 9000 10.2.4.0 helo.enterprise.l 16724370 - -
12924742 - -
vlan2 9000 fe80::260:ddf fe80::260:ddff:fe 0 - -
3 - -
root@helo:~ # netstat -m
7632/6798/14430 mbufs in use (current/cache/total)
4186/2886/7072/1018944 mbuf clusters in use (current/cache/total/max)
4080/1420 mbuf+clusters out of packet secondary zone in use (current/cache)
0/6/6/509472 4k (page size) jumbo clusters in use (current/cache/total/max)
593/25/618/150954 9k jumbo clusters in use (current/cache/total/max)
0/0/0/84912 16k jumbo clusters in use (current/cache/total/max)
15617K/7720K/23337K bytes allocated to network (current/cache/total)
3/72461/0 requests for mbufs denied (mbufs/clusters/mbuf+clusters)
0/0/0 requests for mbufs delayed (mbufs/clusters/mbuf+clusters)
0/0/0 requests for jumbo clusters delayed (4k/9k/16k)
122/391912/0 requests for jumbo clusters denied (4k/9k/16k)
0 requests for sfbufs denied
0 requests for sfbufs delayed
0 requests for I/O initiated by sendfile
root@helo:~ # uptime
9:07AM up 12 days, 8:15, 1 user, load averages: 0.19, 0.19, 0.20
root@helo:~ # ifconfig
mxge0: flags=8843<UP,BROADCAST,RUNNING,SIMPLEX,MULTICAST> metric 0 mtu 9000
options=6c03bb<RXCSUM,TXCSUM,VLAN_MTU,VLAN_HWTAGGING,JUMBO_MTU,VLAN_HWCSUM,TSO4,TSO6,VLAN_HWTSO,LINKSTATE,RXCSUM_IPV6,TXCSUM_IPV6>
ether 00:60:dd:44:d2:07
inet6 fe80::260:ddff:fe44:d207%mxge0 prefixlen 64 scopeid 0x1
nd6 options=29<PERFORMNUD,IFDISABLED,AUTO_LINKLOCAL>
media: Ethernet 10Gbase-CX4 <full-duplex>
status: active
bce0: flags=8843<UP,BROADCAST,RUNNING,SIMPLEX,MULTICAST> metric 0 mtu 1500
options=c01bb<RXCSUM,TXCSUM,VLAN_MTU,VLAN_HWTAGGING,JUMBO_MTU,VLAN_HWCSUM,TSO4,VLAN_HWTSO,LINKSTATE>
ether 08:9e:01:50:a3:08
inet6 fe80::a9e:1ff:fe50:a308%bce0 prefixlen 64 scopeid 0x2
nd6 options=29<PERFORMNUD,IFDISABLED,AUTO_LINKLOCAL>
media: Ethernet autoselect (1000baseT <full-duplex>)
status: active
bce1: flags=8843<UP,BROADCAST,RUNNING,SIMPLEX,MULTICAST> metric 0 mtu 1500
options=c01bb<RXCSUM,TXCSUM,VLAN_MTU,VLAN_HWTAGGING,JUMBO_MTU,VLAN_HWCSUM,TSO4,VLAN_HWTSO,LINKSTATE>
ether 08:9e:01:50:a3:09
inet 128.113.12.134 netmask 0xffffff00 broadcast 128.113.12.255
inet6 fe80::a9e:1ff:fe50:a309%bce1 prefixlen 64 scopeid 0x3
nd6 options=29<PERFORMNUD,IFDISABLED,AUTO_LINKLOCAL>
media: Ethernet autoselect (1000baseT <full-duplex,master>)
status: active
lo0: flags=8049<UP,LOOPBACK,RUNNING,MULTICAST> metric 0 mtu 16384
options=600003<RXCSUM,TXCSUM,RXCSUM_IPV6,TXCSUM_IPV6>
inet6 ::1 prefixlen 128
inet6 fe80::1%lo0 prefixlen 64 scopeid 0x4
inet 127.0.0.1 netmask 0xff000000
nd6 options=21<PERFORMNUD,AUTO_LINKLOCAL>
vlan23: flags=8843<UP,BROADCAST,RUNNING,SIMPLEX,MULTICAST> metric 0 mtu 9000
options=303<RXCSUM,TXCSUM,TSO4,TSO6>
ether 00:60:dd:44:d2:07
inet 10.2.3.244 netmask 0xffffff00 broadcast 10.2.3.255
inet6 fe80::260:ddff:fe44:d207%vlan23 prefixlen 64 scopeid 0x5
nd6 options=29<PERFORMNUD,IFDISABLED,AUTO_LINKLOCAL>
media: Ethernet 10Gbase-CX4 <full-duplex>
status: active
vlan: 23 parent interface: mxge0
vlan24: flags=8843<UP,BROADCAST,RUNNING,SIMPLEX,MULTICAST> metric 0 mtu 9000
options=303<RXCSUM,TXCSUM,TSO4,TSO6>
ether 00:60:dd:44:d2:07
inet 10.2.4.244 netmask 0xffffff00 broadcast 10.2.4.255
inet6 fe80::260:ddff:fe44:d207%vlan24 prefixlen 64 scopeid 0x6
nd6 options=29<PERFORMNUD,IFDISABLED,AUTO_LINKLOCAL>
media: Ethernet 10Gbase-CX4 <full-duplex>
status: active
vlan: 24 parent interface: mxge0
rc.conf:
hostname="helo.bio.rpi.edu"
ifconfig_bce1=" inet 128.113.12.134 netmask 0xffffff00"
ifconfig_mxge0="up mtu 9000"
ifconfig_bce0="up"
cloned_interfaces="vlan23 vlan24"
ifconfig_vlan23="inet 10.2.3.244 netmask 255.255.255.0 vlan 23 vlandev mxge0"
ifconfig_vlan24="inet 10.2.4.244 netmask 255.255.255.0 vlan 24 vlandev mxge0"
defaultrouter="128.113.12.254"
sshd_enable="YES"
ntpd_enable="YES"
powerd_enable="YES"
# Set dumpdev to "AUTO" to enable crash dumps, "NO" to disable
dumpdev="NO"
zfs_enable="YES"
nisdomainname="GALACTICA.BIO.RPI.EDU"
ntpdate_enable="YES"
ntpdate_hosts="ntp.rpi.edu"
rpc_lockd_enable="YES"
rpc_statd_enable="YES"
rpcbind_enable="YES"
nis_client_enable="YES"
nis_client_flags="-m -S GALACTICA.BIO.RPI.EDU,adama.galactica.local"
nfs_server_enable="YES"
mountd_enable="YES"
nfsd_enable="YES"
apcupsd_enable="YES"
#pf_enable="YES"
netwait_enable="YES"
netwait_ip="128.113.12.254"
netwait_if="mxge0"
static_routes="management"
route_management="-net 10.1.1.0/24 10.2.3.254"
amd_enable="YES" # Run amd service with $amd_flags (or NO).
amd_flags="-a /.amd_mnt -l syslog /home amd.home"
amd_map_program="NO" # Can be set to "ypcat -k amd.master"
root@helo:~ # uname -a
FreeBSD helo.bio.rpi.edu 10.0-RELEASE-p4 FreeBSD 10.0-RELEASE-p4 #0: Tue Jun 3 13:14:57 UTC 2014 root@amd64-builder.daemonology.net:/usr/obj/usr/src/sys/GENERIC amd64
Bob Healey
Systems Administrator
Biocomputation and Bioinformatics Constellation
and Molecularium
healer@rpi.edu
(518) 276-4407
On 7/2/2014 11:11 AM, Bob Healey wrote:
> Hello.
>
> I've been wrestling with this on and off for a few months now. I have
> an assortment of systems (some Dell PowerEdge R515, R610, and IBM
> x3630M3) with 10 gig Myricom Ethernet cards acting as NFS servers to
> Linux HPC compute clusters (12-36 nodes, 384-480 cores) connected
> via gigabit Ethernet. They are also connected to the outside world
> via onboard bce (Dell) or igb (IBM). After a variable length of time,
> I will lose all network access to a host. Connecting via console, the
> machine tends to be fully responsive. A reboot clears the problem, but
> I have yet to figure out any sysctls/loader.conf tunables to clear the
> problem and make it stay away. PF is in use to restrict access to the
> host to a pair of public /24's, and to 10/8. If there is a way in
> zfs's sharenfs property to make that restriction, I'd be happy to
> change, but I really don't like leaving nfs open to the university's
> quartet of /16's, so PF it is. The vlan2 interface has mxge0 as its
> parent.
>
> Thanks for any help.
>
> This host is getting ready to crash soon, based on netstat.
> root@husker:~ # netstat -i
> Name Mtu Network Address Ipkts Ierrs Idrop Opkts
> Oerrs Coll
> mxge0 9000 <Link#1> 00:60:dd:44:d2:0a 6358280 262 0
> 4061637 0 0
> mxge0 9000 fe80::260:ddf fe80::260:ddff:fe 0 - -
> 2 - -
> bce0 1500 <Link#2> 08:9e:01:50:a1:ac 276391 0 0
> 0 0 0
> bce0 1500 fe80::a9e:1ff fe80::a9e:1ff:fe5 0 - -
> 3 - -
> bce1 1500 <Link#3> 08:9e:01:50:a1:ad 2229709391 16921 0
> 1182942116 0 0
> bce1 1500 128.113.12.0 husker 2226254093 - -
> 1183962005 - -
> bce1 1500 fe80::a9e:1ff fe80::a9e:1ff:fe5 0 - -
> 3 - -
> lo0 16384 <Link#4> 2030 0 0
> 2030 0 0
> lo0 16384 localhost ::1 4 - -
> 4 - -
> lo0 16384 fe80::1%lo0 fe80::1 0 - -
> 0 - -
> lo0 16384 your-net localhost 2026 - -
> 2026 - -
> vlan2 9000 <Link#5> 00:60:dd:44:d2:0a 4387250 0 0
> 3060586 0 0
> vlan2 9000 10.2.3.0 husker.galactica. 4370309 - -
> 3963931 - -
> vlan2 9000 fe80::260:ddf fe80::260:ddff:fe 0 - -
> 2 - -
> vlan2 9000 <Link#6> 00:60:dd:44:d2:0a 1971034 0 0
> 1001061 0 0
> vlan2 9000 10.2.4.0 husker.enterprise 1700742 - -
> 1961891 - -
> vlan2 9000 fe80::260:ddf fe80::260:ddff:fe 0 - -
> 4 - -
> root@husker:~ # netstat -im
> 6157/3233/9390 mbufs in use (current/cache/total)
> 4081/1883/5964/1018800 mbuf clusters in use (current/cache/total/max)
> 4080/795 mbuf+clusters out of packet secondary zone in use (current/cache)
> 0/5/5/509399 4k (page size) jumbo clusters in use (current/cache/total/max)
> 512/23/535/150933 9k jumbo clusters in use (current/cache/total/max)
> 0/0/0/84899 16k jumbo clusters in use (current/cache/total/max)
> 14309K/4801K/19110K bytes allocated to network (current/cache/total)
> 10/1883/0 requests for mbufs denied (mbufs/clusters/mbuf+clusters)
> 0/0/0 requests for mbufs delayed (mbufs/clusters/mbuf+clusters)
> 0/0/0 requests for jumbo clusters delayed (4k/9k/16k)
> 2/1736/0 requests for jumbo clusters denied (4k/9k/16k)
> 0 requests for sfbufs denied
> 0 requests for sfbufs delayed
> 0 requests for I/O initiated by sendfile
> root@husker:~ # uptime
> 11:07AM up 23 days, 19:27, 1 user, load averages: 0.14, 0.17, 0.13
> root@husker:~ # sysctl -a | grep nmb
> kern.ipc.nmbclusters: 1018800
> kern.ipc.nmbjumbop: 509399
> kern.ipc.nmbjumbo9: 452799
> kern.ipc.nmbjumbo16: 339596
> kern.ipc.nmbufs: 6520320
> root@husker:~ # cat /boot/loader.conf
> zfs_load="YES"
> amdtemp_load="YES"
> if_mxge_load="YES"
> mxge_ethp_z8e_load="YES"
> mxge_eth_z8e_load="YES"
> mxge_rss_ethp_z8e_load="YES"
> mxge_rss_eth_z8e_load="YES"
> vfs.zfs.arc_max="12288M"
> root@husker:~ # cat /var/run/dmesg.boot | head -16
> Copyright (c) 1992-2014 The FreeBSD Project.
> Copyright (c) 1979, 1980, 1983, 1986, 1988, 1989, 1991, 1992, 1993, 1994
> The Regents of the University of California. All rights reserved.
> FreeBSD is a registered trademark of The FreeBSD Foundation.
> FreeBSD 10.0-RELEASE-p4 #0: Tue Jun 3 13:14:57 UTC 2014
> root@amd64-builder.daemonology.net:/usr/obj/usr/src/sys/GENERIC amd64
> FreeBSD clang version 3.3 (tags/RELEASE_33/final 183502) 20130610
> CPU: AMD Opteron(tm) Processor 4122 (2200.07-MHz K8-class CPU)
> Origin = "AuthenticAMD" Id = 0x100f80 Family = 0x10 Model = 0x8
> Stepping = 0
> Features=0x178bfbff<FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CLFLUSH,MMX,FXSR,SSE,SSE2,HTT>
>
> Features2=0x802009<SSE3,MON,CX16,POPCNT>
> AMD
> Features=0xee500800<SYSCALL,NX,MMX+,FFXSR,Page1GB,RDTSCP,LM,3DNow!+,3DNow!>
> AMD
> Features2=0x837ff<LAHF,CMP,SVM,ExtAPIC,CR8,ABM,SSE4A,MAS,Prefetch,OSVW,IBS,SKINIT,WDT,NodeId>
> TSC: P-state invariant
> real memory = 17179869184 (16384 MB)
> avail memory = 16588054528 (15819 MB)
>
>
_______________________________________________
freebsd-stable@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-stable
To unsubscribe, send any mail to "freebsd-stable-unsubscribe@freebsd.org"
討論串 (同標題文章)
完整討論串 (本文為第 12 之 17 篇):