Skip to content

Commit

Permalink
Release 1.8
Browse files Browse the repository at this point in the history
  • Loading branch information
daviesrob committed Apr 3, 2018
2 parents 33b14f1 + ad7d585 commit ae6b0c5
Show file tree
Hide file tree
Showing 43 changed files with 1,762 additions and 206 deletions.
15 changes: 15 additions & 0 deletions INSTALL
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,21 @@ to an HTSlib source tree or installation in DIR; or --with-htslib=system
to use a system-installed HTSlib.


Optional Compilation with Perl
==============================

The '-i' and '-e' options can take external perl scripts for a more
sophisticated filtering. This option can be enabled by supplying the
--enable-perl-filters option to configure before running make:

./configure --enable-perl-filters

Note that enabling this option changes the license from MIT to GPL
because bcftools need to be built with

perl -MExtUtils::Embed -e ccopts -e ldopts


Optional Compilation with GSL
=============================

Expand Down
7 changes: 4 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ endif

include config.mk

PACKAGE_VERSION = 1.7
PACKAGE_VERSION = 1.8

# If building from a Git repository, replace $(PACKAGE_VERSION) with the Git
# description of the working tree: either a release tag with the same value
Expand Down Expand Up @@ -135,7 +135,7 @@ ifdef USE_GPL
endif

bcftools: $(OBJS) $(HTSLIB)
$(CC) $(DYNAMIC_FLAGS) -pthread $(ALL_LDFLAGS) -o $@ $(OBJS) $(HTSLIB_LIB) -lm $(ALL_LIBS) $(GSL_LIBS)
$(CC) $(DYNAMIC_FLAGS) -pthread $(ALL_LDFLAGS) -o $@ $(OBJS) $(HTSLIB_LIB) -lm $(ALL_LIBS) $(GSL_LIBS) $(PERL_LIBS)

# Plugin rules
ifneq "$(PLUGINS_ENABLED)" "no"
Expand Down Expand Up @@ -212,7 +212,8 @@ ccall.o: ccall.c $(htslib_kfunc_h) $(call_h) kmin.h $(prob1_h)
convert.o: convert.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(convert_h)
tsv2vcf.o: tsv2vcf.c $(tsv2vcf_h)
em.o: em.c $(htslib_vcf_h) kmin.h $(call_h)
filter.o: filter.c $(htslib_khash_str2int_h) $(filter_h) $(bcftools_h) $(htslib_hts_defs_h) $(htslib_vcfutils_h)
filter.o: filter.c config.h $(htslib_khash_str2int_h) $(filter_h) $(bcftools_h) $(htslib_hts_defs_h) $(htslib_vcfutils_h)
$(CC) $(CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CPPFLAGS) $(PERL_CFLAGS) -c -o $@ $<
gvcf.o: gvcf.c gvcf.h $(call_h)
kmin.o: kmin.c kmin.h
mcall.o: mcall.c $(htslib_kfunc_h) $(call_h)
Expand Down
15 changes: 15 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
## Release 1.8 (April 2018)

* `-i, -e` filtering: Support for custom perl scripts

* `+contrast`: New plugin to annotate genotype differences between groups of samples

* `+fixploidy`: New options for simpler ploidy usage

* `+setGT`: Target genotypes can be set to phased by giving `--new-gt p`

* `run-roh.pl`: Allow to pass options directly to `bcftools roh`

* Number of bug fixes


## Release 1.7 (February 2018)

* `-i, -e` filtering: Major revamp, improved filtering by FORMAT fields
Expand Down
2 changes: 2 additions & 0 deletions config.mk.in
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ DYNAMIC_FLAGS = @CC_RDYNAMIC_OPT@

USE_GPL = @USE_GPL@
GSL_LIBS = @GSL_LIBS@
PERL_CFLAGS = @PERL_CCOPTS@
PERL_LIBS = @PERL_LIBS@

PLATFORM = @PLATFORM@
PLUGINS_ENABLED = @enable_bcftools_plugins@
Expand Down
19 changes: 19 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,12 @@ AC_ARG_WITH([bcf-plugin-path],
[with_bcf_plugin_path=$with_bcf_plugin_dir])
AC_SUBST([PLUGINPATH], $with_bcf_plugin_path)

AC_ARG_ENABLE([perl-filters],
[AS_HELP_STRING([--enable-perl-filters],
[do not build support for PERL scripts in -i/-e filtering expressions])],
[], [enable_perl_filters=no])
AC_SUBST(enable_perl_filters)

AC_ARG_ENABLE([libgsl],
[AS_HELP_STRING([--enable-libgsl],
[build options that require the GNU scientific library (changes bcftools license to GPL, see INSTALL for details)])],
Expand Down Expand Up @@ -199,6 +205,19 @@ Either configure with --disable-libgsl or resolve this error to build bcftools.
AC_SUBST([GSL_LIBS])
])

AS_IF([test "$enable_perl_filters" != "no" ], [dnl
if ! perl -MExtUtils::Embed -e ccopts -e ldopts &>/dev/null ; then
AC_MSG_ERROR([Cannot `perl -MExtUtils::Embed -e ccopts -e ldopts`])
fi
AC_DEFINE([ENABLE_PERL_FILTERS], 1, [Define if BCFtools should enable for support PERL scripts in -i/-e filtering expressions.])
PERL_CCOPTS="`perl -MExtUtils::Embed -e ccopts`"
PERL_LIBS="`perl -MExtUtils::Embed -e ldopts`"
AC_SUBST([PERL_CCOPTS])
AC_SUBST([PERL_LIBS])
USE_GPL=1
AC_SUBST([USE_GPL])
])

AC_CONFIG_FILES([config.mk])
AC_OUTPUT

Expand Down
1 change: 1 addition & 0 deletions consensus.c
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,7 @@ static void init_region(args_t *args, char *line)
else to--;
}
}
free(args->chr);
args->chr = strdup(line);
args->rid = bcf_hdr_name2id(args->hdr,line);
if ( args->rid<0 ) fprintf(stderr,"Warning: Sequence \"%s\" not in %s\n", line,args->fname);
Expand Down
23 changes: 22 additions & 1 deletion convert.c
Original file line number Diff line number Diff line change
Expand Up @@ -760,12 +760,33 @@ static void process_gt_to_hap(convert_t *convert, bcf1_t *line, fmt_t *fmt, int

if ( fmt_gt->type!=BCF_BT_INT8 ) // todo: use BRANCH_INT if the VCF is valid
error("Uh, too many alleles (%d) or redundant BCF representation at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
if ( fmt_gt->n!=1 && fmt_gt->n!=2 )
error("Uh, ploidy of %d not supported, see %s:%d\n", fmt_gt->n, bcf_seqname(convert->header, line), line->pos+1);

int8_t *ptr = ((int8_t*) fmt_gt->p) - fmt_gt->n;
for (i=0; i<convert->nsamples; i++)
{
ptr += fmt_gt->n;
if ( ptr[0]==2 )
if ( fmt_gt->n==1 ) // haploid genotypes
{
if ( ptr[0]==2 ) /* 0 */
{
str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
}
else if ( ptr[0]==bcf_int32_missing ) /* . */
{
str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
}
else if ( ptr[0]==4 ) /* 1 */
{
str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
}
else
{
kputw(bcf_gt_allele(ptr[0]),str); str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
}
}
else if ( ptr[0]==2 )
{
if ( ptr[1]==3 ) /* 0|0 */
{
Expand Down
2 changes: 1 addition & 1 deletion csq.c
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@
A .. gene line with a supported biotype
A.ID=~/^gene:/
B .. transcript line referencing A
B .. transcript line referencing A with supported biotype
B.ID=~/^transcript:/ && B.Parent=~/^gene:A.ID/
C .. corresponding CDS, exon, and UTR lines:
Expand Down
120 changes: 96 additions & 24 deletions doc/bcftools.1
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@
.\" Title: bcftools
.\" Author: [see the "AUTHORS" section]
.\" Generator: DocBook XSL Stylesheets v1.76.1 <http://docbook.sf.net/>
.\" Date: 2018-02-12
.\" Date: 2018-04-03
.\" Manual: \ \&
.\" Source: \ \&
.\" Language: English
.\"
.TH "BCFTOOLS" "1" "2018\-02\-12" "\ \&" "\ \&"
.TH "BCFTOOLS" "1" "2018\-04\-03" "\ \&" "\ \&"
.\" -----------------------------------------------------------------
.\" * Define some portability stuff
.\" -----------------------------------------------------------------
Expand Down Expand Up @@ -41,7 +41,7 @@ Most commands accept VCF, bgzipped VCF and BCF with filetype detected automatica
BCFtools is designed to work on a stream\&. It regards an input file "\-" as the standard input (stdin) and outputs to the standard output (stdout)\&. Several commands can thus be combined with Unix pipes\&.
.SS "VERSION"
.sp
This manual page was last updated \fB2018\-02\-12\fR and refers to bcftools git version \fB1\&.7\fR\&.
This manual page was last updated \fB2018\-04\-03\fR and refers to bcftools git version \fB1\&.8\fR\&.
.SS "BCF1"
.sp
The BCF1 format output by versions of samtools <= 0\&.1\&.19 is \fBnot\fR compatible with this version of bcftools\&. To read BCF1 files one can use the view command from old versions of bcftools packaged with samtools versions <= 0\&.1\&.19 to convert to VCF, which can then be read by this version of bcftools\&.
Expand Down Expand Up @@ -1796,8 +1796,45 @@ reference sequence in fasta format (required)
\fB\-g, \-\-gff\-annot\fR \fIFILE\fR
.RS 4
GFF3 annotation file (required), such as
ftp://ftp\&.ensembl\&.org/pub/current_gff3/homo_sapiens/
ftp://ftp\&.ensembl\&.org/pub/current_gff3/homo_sapiens\&. An example of a minimal working GFF file:
.RE
.sp
.if n \{\
.RS 4
.\}
.nf
# The program looks for "CDS", "exon", "three_prime_UTR" and "five_prime_UTR" lines,
# looks up their parent transcript (determined from the "Parent=transcript:" attribute),
# the gene (determined from the transcript\*(Aqs "Parent=gene:" attribute), and the biotype
# (the most interesting is "protein_coding")\&.
#
# Attributes required for
# gene lines:
# \- ID=gene:<gene_id>
# \- biotype=<biotype>
# \- Name=<gene_name> [optional]
#
# transcript lines:
# \- ID=transcript:<transcript_id>
# \- Parent=gene:<gene_id>
# \- biotype=<biotype>
#
# other lines (CDS, exon, five_prime_UTR, three_prime_UTR):
# \- Parent=transcript:<transcript_id>
#
# Supported biotypes:
# \- see the function gff_parse_biotype() in bcftools/csq\&.c

1 ignored_field gene 21 2148 \&. \- \&. ID=gene:GeneId;biotype=protein_coding;Name=GeneName
1 ignored_field transcript 21 2148 \&. \- \&. ID=transcript:TranscriptId;Parent=gene:GeneId;biotype=protein_coding
1 ignored_field three_prime_UTR 21 2054 \&. \- \&. Parent=transcript:TranscriptId
1 ignored_field exon 21 2148 \&. \- \&. Parent=transcript:TranscriptId
1 ignored_field CDS 21 2148 \&. \- 1 Parent=transcript:TranscriptId
1 ignored_field five_prime_UTR 210 2148 \&. \- \&. Parent=transcript:TranscriptId
.fi
.if n \{\
.RE
.\}
.PP
\fB\-i, \-\-include\fR \fIEXPRESSION\fR
.RS 4
Expand Down Expand Up @@ -2096,7 +2133,7 @@ see
.RE
.SS "bcftools gtcheck [\fIOPTIONS\fR] [\-g \fIgenotypes\&.vcf\&.gz\fR] \fIquery\&.vcf\&.gz\fR"
.sp
Checks sample identity or, without \fB\-g\fR, multi\-sample cross\-check is performed\&.
Checks sample identity\&. The program can operate in two modes\&. If the \fB\-g\fR option is given, the identity of the \fB\-s\fR sample from \fIquery\&.vcf\&.gz\fR is checked against the samples in the \fB\-g\fR file\&. Without the \fB\-g\fR option, multi\-sample cross\-check of samples in \fIquery\&.vcf\&.gz\fR is performed\&.
.PP
\fB\-a, \-\-all\-sites\fR
.RS 4
Expand Down Expand Up @@ -2227,21 +2264,14 @@ is used for the unseen genotypes\&. With
can be used instead; the discordance value then gives exactly the number of differing genotypes\&.
.RE
.PP
SM, Average Discordance
.RS 4
Average discordance between sample
\fIa\fR
and all other samples\&.
.RE
.PP
SM, Average Depth
ERR, error rate
.RS 4
Average depth at evaluated sites, or 1 if FORMAT/DP field is not present\&.
Pairwise error rate calculated as number of differences divided by the total number of comparisons\&.
.RE
.PP
SM, Average Number of sites
CLUSTER, TH, DOT
.RS 4
The average number of sites used to calculate the discordance\&. In other words, the average number of non\-missing PLs/genotypes seen both samples\&.
In presence of multiple samples, related samples and outliers can be identified by clustering samples by error rate\&. A simple hierarchical clustering based on minimization of standard deviation is used\&. This is useful to detect sample swaps, for example in situations where one sample has been sequenced in multiple runs\&.
.RE
.RE
.SS "bcftools index [\fIOPTIONS\fR] \fIin\&.bcf\fR|\fIin\&.vcf\&.gz\fR"
Expand Down Expand Up @@ -2986,7 +3016,7 @@ and
can swap alleles and will update genotypes (GT) and AC counts, but will not attempt to fix PL or other fields\&.
.RE
.PP
\fB\-d, \-\-rm\-dup\fR \fIsnps\fR|\fIindels\fR|\fIboth\fR|\fIany\fR
\fB\-d, \-\-rm\-dup\fR \fIsnps\fR|\fIindels\fR|\fIboth\fR|\fIall\fR|\fInone\fR
.RS 4
If a record is present multiple times, output only the first instance, see
\fB\-\-collapse\fR
Expand All @@ -2997,8 +3027,7 @@ in
\fB\-D, \-\-remove\-duplicates\fR
.RS 4
If a record is present in multiple files, output only the first instance\&. Alias for
\fB\-d none\fR\&. Requires
\fB\-a, \-\-allow\-overlaps\fR\&.
\fB\-d none\fR, deprecated\&.
.RE
.PP
\fB\-f, \-\-fasta\-ref\fR \fIFILE\fR
Expand Down Expand Up @@ -3864,12 +3893,28 @@ nor the other
options are given, the allele frequency is estimated from AC and AN counts which are already present in the INFO field\&.
.RE
.PP
\fB\-\-exclude\fR \fIEXPRESSION\fR
.RS 4
exclude sites for which
\fIEXPRESSION\fR
is true\&. For valid expressions see
\fBEXPRESSIONS\fR\&.
.RE
.PP
\fB\-G, \-\-GTs\-only\fR \fIFLOAT\fR
.RS 4
use genotypes (FORMAT/GT fields) ignoring genotype likelihoods (FORMAT/PL), setting PL of unseen genotypes to
\fIFLOAT\fR\&. Safe value to use is 30 to account for GT errors\&.
.RE
.PP
\fB\-\-include\fR \fIEXPRESSION\fR
.RS 4
include only sites for which
\fIEXPRESSION\fR
is true\&. For valid expressions see
\fBEXPRESSIONS\fR\&.
.RE
.PP
\fB\-I, \-\-skip\-indels\fR
.RS 4
skip indels as their genotypes are usually enriched for errors
Expand Down Expand Up @@ -4680,22 +4725,26 @@ TYPE!~"snp"
.sp -1
.IP \(bu 2.3
.\}
array subscripts (0\-based), "*" for any field, "\-" to indicate a range\&. Note that for querying FORMAT vectors, the colon ":" can be used to select a sample and a subfield
array subscripts (0\-based), "*" for any element, "\-" to indicate a range\&. Note that for querying FORMAT vectors, the colon ":" can be used to select a sample and an element of the vector, as shown in the examples below
.sp
.if n \{\
.RS 4
.\}
.nf
(DP4[0]+DP4[1])/(DP4[2]+DP4[3]) > 0\&.3
DP4[*] == 0
CSQ[*] ~ "missense_variant\&.*deleterious"
INFO/AF[0] > 0\&.3 \&.\&. first AF value bigger than 0\&.3
FORMAT/AD[0:0] > 30 \&.\&. first AD value of the first sample bigger than 30
FORMAT/AD[0:1] \&.\&. first sample, second AD value
FORMAT/AD[1:0] \&.\&. second sample, first AD value
DP4[*] == 0 \&.\&. any DP4 value
FORMAT/DP[0] > 30 \&.\&. DP of the first sample bigger than 30
FORMAT/DP[1\-3] > 10 \&.\&. samples 2\-4
FORMAT/DP[1\-] < 7 \&.\&. all samples but the first
FORMAT/DP[0,2\-4] > 20 \&.\&. samples 1, 3\-5
FORMAT/AD[0:1] \&.\&. first sample, second AD field
FORMAT/AD[0:*], AD[0:] or AD[0] \&.\&. first sample, any AD field
FORMAT/AD[*:1] or AD[:1] \&.\&. any sample, second AD field
(DP4[0]+DP4[1])/(DP4[2]+DP4[3]) > 0\&.3
CSQ[*] ~ "missense_variant\&.*deleterious"
.fi
.if n \{\
.RE
Expand Down Expand Up @@ -4743,6 +4792,29 @@ N_ALT, N_SAMPLES, AC, MAC, AF, MAF, AN, N_MISSING, F_MISSING
.RE
.\}
.RE
.sp
.RS 4
.ie n \{\
\h'-04'\(bu\h'+03'\c
.\}
.el \{\
.sp -1
.IP \(bu 2.3
.\}
custom perl filtering\&. Note that this command is not compiled in by default, see the section
\fBOptional Compilation with Perl\fR
in the INSTALL file for help and misc/demo\-flt\&.pl for a working example\&. The demo defined the perl subroutine "severity" which can be invoked from the command line as follows:
.sp
.if n \{\
.RS 4
.\}
.nf
perl:path/to/script\&.pl; perl\&.severity(INFO/CSQ) > 3
.fi
.if n \{\
.RE
.\}
.RE
.PP
\fBNotes:\fR
.sp
Expand Down Expand Up @@ -4776,7 +4848,7 @@ Variables and function names are case\-insensitive, but not tag names\&. For exa
.sp -1
.IP \(bu 2.3
.\}
When querying multiple subfields, all subfields are tested and the OR logic is used on the result\&. For example, when querying "TAG=1,2,3,4", it will be evaluated as follows:
When querying multiple values, all elements are tested and the OR logic is used on the result\&. For example, when querying "TAG=1,2,3,4", it will be evaluated as follows:
.sp
.if n \{\
.RS 4
Expand Down
Loading

0 comments on commit ae6b0c5

Please sign in to comment.