Skip to content

Commit

Permalink
Release 1.21
Browse files Browse the repository at this point in the history
  • Loading branch information
daviesrob committed Sep 12, 2024
2 parents 02ee548 + b411171 commit bcf6c62
Show file tree
Hide file tree
Showing 142 changed files with 4,543 additions and 906 deletions.
7 changes: 5 additions & 2 deletions .cirrus.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ gcc_task:
LC_ALL: C
CIRRUS_CLONE_DEPTH: 1
HTSDIR: ./htslib
CFLAGS: -fsanitize=address
LDFLAGS: -fsanitize=address

matrix:
- environment:
Expand All @@ -82,6 +84,7 @@ ubuntu_task:

environment:
CC: clang
CFLAGS: -g -O2 -Werror -Wall -Wformat -Wformat=2
LC_ALL: C
CIRRUS_CLONE_DEPTH: 1
HTSDIR: ./htslib
Expand All @@ -93,8 +96,8 @@ ubuntu_task:
memory: 2G
environment:
USE_CONFIG: yes
CFLAGS: -g -Wall -O3 -fsanitize=address
LDFLAGS: -fsanitize=address -Wl,-rpath,`pwd`/inst/lib
CFLAGS: -g -Wall -O3
LDFLAGS: -Wl,-rpath,`pwd`/inst/lib

# NB: we could consider building a docker image with these
# preinstalled and specifying that instead, to speed up testing.
Expand Down
2 changes: 1 addition & 1 deletion HMM.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ double *hmm_get_fwd_bwd_prob(hmm_t *hmm);
* @sites: list of positions
*
* Same as hmm_run_fwd_bwd, in addition a pointer to a matrix with the new
* transition probabilities is returned. In this verison, emission
* transition probabilities is returned. In this version, emission
* probabilities are not updated.
*/
double *hmm_run_baum_welch(hmm_t *hmm, int nsites, double *eprob, uint32_t *sites);
Expand Down
28 changes: 14 additions & 14 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ OBJS = main.o vcfindex.o tabix.o \
vcfcnv.o vcfhead.o HMM.o consensus.o ploidy.o bin.o hclust.o version.o \
regidx.o smpl_ilist.o csq.o vcfbuf.o \
mpileup.o bam2bcf.o bam2bcf_indel.o bam2bcf_iaux.o bam2bcf_edlib.o \
read_consensus.o bam_sample.o \
read_consensus.o bam_sample.o \
vcfsort.o cols.o extsort.o dist.o abuf.o \
ccall.o em.o prob1.o kmin.o str_finder.o gff.o edlib.o
PLUGIN_OBJS = vcfplugin.o
Expand Down Expand Up @@ -105,7 +105,7 @@ endif

include config.mk

PACKAGE_VERSION = 1.20
PACKAGE_VERSION = 1.21

# If building from a Git repository, replace $(PACKAGE_VERSION) with the Git
# description of the working tree: either a release tag with the same value
Expand Down Expand Up @@ -235,7 +235,6 @@ vcfbuf_h = vcfbuf.h $(htslib_vcf_h)
abuf_h = abuf.h $(htslib_vcf_h)
dbuf_h = dbuf.h $(htslib_vcf_h)
bam2bcf_h = bam2bcf.h $(htslib_hts_h) $(htslib_vcf_h)
edlib.h = edlib.h
bam_sample_h = bam_sample.h $(htslib_sam_h)
cigar_state_h = cigar_state.h $(htslib_hts_h) $(htslib_sam_h)
read_consensus_h = read_consensus.h $(htslib_hts_h) $(htslib_sam_h)
Expand All @@ -249,17 +248,17 @@ vcfcall.o: vcfcall.c $(htslib_vcf_h) $(htslib_kfunc_h) $(htslib_synced_bcf_reade
vcfconcat.o: vcfconcat.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_thread_pool_h) $(htslib_hts_endian_h) $(bcftools_h)
vcfconvert.o: vcfconvert.c $(htslib_faidx_h) $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_kseq_h) $(htslib_hts_endian_h) $(bcftools_h) $(filter_h) $(convert_h) $(tsv2vcf_h)
vcffilter.o: vcffilter.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) rbuf.h regidx.h
vcfgtcheck.o: vcfgtcheck.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_kbitset_h) $(htslib_hts_os_h) $(htslib_bgzf_h) $(bcftools_h) extsort.h filter.h
vcfgtcheck.o: vcfgtcheck.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_kbitset_h) $(htslib_hts_os_h) $(htslib_bgzf_h) $(bcftools_h) extsort.h $(filter_h)
vcfindex.o: vcfindex.c $(htslib_vcf_h) $(htslib_tbx_h) $(htslib_kstring_h) $(htslib_bgzf_h) $(bcftools_h)
vcfisec.o: vcfisec.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(bcftools_h) $(filter_h)
vcfmerge.o: vcfmerge.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) $(htslib_kbitset_h) $(htslib_hts_endian_h) $(bcftools_h) regidx.h vcmp.h $(htslib_khash_h) $(htslib_kbitset_h)
vcfnorm.o: vcfnorm.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_faidx_h) $(htslib_khash_str2int_h) $(bcftools_h) rbuf.h abuf.h gff.h regidx.h
vcfisec.o: vcfisec.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(htslib_hts_defs_h) $(bcftools_h) $(filter_h)
vcfmerge.o: vcfmerge.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) $(htslib_kbitset_h) $(htslib_hts_endian_h) $(bcftools_h) regidx.h vcmp.h $(htslib_khash_h)
vcfnorm.o: vcfnorm.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_faidx_h) $(htslib_khash_str2int_h) $(bcftools_h) rbuf.h abuf.h gff.h regidx.h $(filter_h)
vcfquery.o: vcfquery.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_str2int_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) $(convert_h) $(smpl_ilist_h)
vcfroh.o: vcfroh.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(bcftools_h) HMM.h $(smpl_ilist_h) $(filter_h)
vcfcnv.o: vcfcnv.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstring_h) $(htslib_kfunc_h) $(htslib_khash_str2int_h) $(bcftools_h) HMM.h rbuf.h
vcfcnv.o: vcfcnv.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstring_h) $(htslib_kfunc_h) $(htslib_khash_str2int_h) $(htslib_hts_defs_h) $(bcftools_h) HMM.h rbuf.h
vcfhead.o: vcfhead.c $(htslib_kstring_h) $(htslib_vcf_h) $(bcftools_h)
vcfsom.o: vcfsom.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(bcftools_h)
vcfsort.o: vcfsort.c $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_hts_os_h) kheap.h $(bcftools_h)
vcfsom.o: vcfsom.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(htslib_hts_defs_h) $(bcftools_h)
vcfsort.o: vcfsort.c $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_hts_os_h) $(htslib_hts_defs_h) $(htslib_bgzf_h) kheap.h $(bcftools_h)
vcfstats.o: vcfstats.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) $(bcftools_h) $(filter_h) bin.h dist.h
vcfview.o: vcfview.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) $(htslib_khash_str2int_h) $(htslib_kbitset_h)
reheader.o: reheader.c $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_kseq_h) $(htslib_thread_pool_h) $(htslib_faidx_h) $(htslib_khash_str2int_h) $(bcftools_h) $(khash_str2str_h)
Expand All @@ -276,7 +275,7 @@ mcall.o: mcall.c $(htslib_kfunc_h) $(htslib_khash_str2int_h) $(call_h) $(prob1_h
prob1.o: prob1.c $(prob1_h)
vcmp.o: vcmp.c $(htslib_hts_h) $(htslib_vcf_h) vcmp.h
ploidy.o: ploidy.c $(htslib_khash_str2int_h) $(htslib_kseq_h) $(htslib_hts_h) $(bcftools_h) $(ploidy_h)
polysomy.o: polysomy.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(bcftools_h) peakfit.h
polysomy.o: polysomy.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_hts_defs_h) $(bcftools_h) peakfit.h
peakfit.o: peakfit.c peakfit.h $(htslib_hts_h) $(htslib_kstring_h)
bin.o: bin.c $(bcftools_h) bin.h
dist.o: dist.c dist.h
Expand All @@ -287,14 +286,15 @@ mpileup.o: mpileup.c $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h) $(hts
bam2bcf.o: bam2bcf.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_kstring_h) $(htslib_kfunc_h) $(bam2bcf_h) mw.h
bam2bcf_indel.o: bam2bcf_indel.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_str2int_h) $(bam2bcf_h) $(htslib_ksort_h) $(str_finder_h)
bam2bcf_iaux.o: bam2bcf_iaux.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_str2int_h) $(bcftools_h) $(bam2bcf_h) $(htslib_ksort_h) $(read_consensus_h) $(cigar_state_h)
bam2bcf_edlib.o: bam2bcf_edlib.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_str2int_h) $(bcftools_h) $(bam2bcf_h) $(htslib_ksort_h) $(read_consensus_h) $(cigar_state_h) $(edlib.h)
bam2bcf_edlib.o: bam2bcf_edlib.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_str2int_h) $(bam2bcf_h) $(str_finder_h) $(htslib_ksort_h) edlib.h
read_consensus.o: read_consensus.c $(read_consensus_h) $(cigar_state_h) $(bcftools_h) kheap.h
bam_sample.o: bam_sample.c $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_str2int_h) $(khash_str2str_h) $(bam_sample_h) $(bcftools_h)
version.o: version.h version.c
hclust.o: hclust.c $(htslib_hts_h) $(htslib_kstring_h) $(bcftools_h) hclust.h
HMM.o: HMM.c $(htslib_hts_h) HMM.h
vcfbuf.o: vcfbuf.c $(htslib_vcf_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(bcftools_h) $(vcfbuf_h) rbuf.h
vcfbuf.o: vcfbuf.c $(htslib_vcf_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(htslib_kbitset_h) $(bcftools_h) $(vcfbuf_h) rbuf.h
abuf.o: abuf.c $(htslib_vcf_h) $(bcftools_h) rbuf.h abuf.h
edlib.o: edlib.c edlib.h
extsort.o: extsort.c $(bcftools_h) extsort.h kheap.h
smpl_ilist.o: smpl_ilist.c $(bcftools_h) $(smpl_ilist_h)
gff.o: gff.c $(htslib_hts_h) $(htslib_khash_h) $(htslib_khash_str2int_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(bcftools_h) gff.h regidx.h
Expand Down Expand Up @@ -326,7 +326,7 @@ test/test-rbuf.o: test/test-rbuf.c rbuf.h
test/test-rbuf: test/test-rbuf.o
$(CC) $(LDFLAGS) -o $@ $^ $(ALL_LIBS)

test/test-regidx.o: test/test-regidx.c $(htslib_kstring_h) $(htslib_hts_os_h) regidx.h
test/test-regidx.o: test/test-regidx.c $(htslib_kstring_h) $(htslib_hts_os_h) $(htslib_hts_defs_h) regidx.h

test/test-regidx: test/test-regidx.o regidx.o | $(HTSLIB)
$(CC) $(ALL_LDFLAGS) -o $@ $^ $(HTSLIB_LIB) -lpthread $(ALL_LIBS)
Expand Down
127 changes: 124 additions & 3 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,3 +1,124 @@
## Release 1.21 (12th September 2024)



Changes affecting the whole of bcftools, or multiple commands:

* Support multiple semicolon-separated strings when filtering by ID using -i/-e (#2190).
For example, `-i 'ID="rs123"'` now correctly matches `rs123;rs456`

* The filtering expression ILEN can be positive (insertion), negative (deletion), zero
(balanced substitutions), or set to missing value (symbolic alleles).

* bcftools query
* bcftools +split-vep

- The column indices printed by default with `-H` (e.g., "#[1]CHROM") can now be
suppressed by giving the option twice `-HH` (#2152)


Changes affecting specific commands:

* bcftools annotate

- Support dynamic variables read from a tab-delimited annotation file (#2151)
For example, in the two cases below the field 'STR' from the -a file is required to match
the INFO/TAG in VCF. In the first example the alleles REF,ALT must match, in the second
example they are ignored. The option -k is required to output also records that were not
annotated:

bcftools annotate -a ann.tsv.gz -c CHROM,POS,REF,ALT,SCORE,~STR -i'TAG={STR}' -k in.vcf
bcftools annotate -a ann.tsv.gz -c CHROM,POS,-,-,SCORE,~STR -i'TAG={STR}' -k in.vcf

- When adding Type=String annotations from a tab-delimited file, encode characters with
special meaning using percent encoding (';', '=' in INFO and ':' in FORMAT) (#2202)

* bcftools consensus

- Allow applying a reference allele which overlaps a previous deletion; there is no
need to complain about overlapping alleles in such a case

- Fix a bug which required `-s -` to be present even when there were no samples in the VCF
(#2260)

* bcftools csq

- Fix a rare bug where indel combined with a substitution ending at exon boundary is
incorrectly predicted to have 'inframe' rather than 'frameshift' consequence (#2212)

* bcftools gtcheck

- Fix a segfault with --no-HWE-prob. The bug was introduced with the output format change in
1.19 which replaced the DC section with DCv2 (#2180)

- The number of matching genotypes in the DCv2 output was not calculated correctly with
non-zero `-E, --error-probability`. Consequently, also the average HWE score was incorrect.
The main output, the discordance score, was not affected by the bug

* bcftools +mendelian2

- Include the number of good cases where at least one of the trio genotypes has an alternate
allele (#2204)

- Fix the error message which would report the wrong sample when non-existent sample is given.
Note that bug only affected the error message, the program otherwise assigns the family
members correctly (#2242)

* bcftools merge

- Fix a severe bug in merging of FORMAT fields with Number=R and Number=A values. For example,
rows with high-coverage FORMAT/AD values (bigger or equal to 128) could have been assigned
to incorrect samples. The bug was introduced in version 1.19. For details see #2244.

* bcftools mpileup

- Return non-zero error code when the input BAM/CRAM file is truncated (#2177)

- Add FORMAT/AD annotation by default, disable with `-a -AD`

* bcftools norm

- Support realignment of symbolic <DUP.*> alleles, similarly to <DEL.*> added previously
(#1919,#2145)

- Fix in reporting reference allele genotypes with `--multi-overlaps .` (#2160)

- Support of duplicate removal of symbolic alleles of the same type but different SVLEN (#2182)

- New `-S, --sort` switch to optionally sort output records by allele (#1484)

- Add the `-i/-e` filtering options to select records for normalization. Note duplicate
removal ignores this option.

- Fix a bug where `--atomize` would not fill GT alleles for atomized SNVs followed by
an indel (#2239)

* bcftools +remove-overlaps

- Revamp the program to allow greater flexibility, with the following new options:

-M, --mark-tag TAG Mark -m sites with INFO/TAG
-m, --mark EXPR Mark (if also -M is present) or remove sites [overlap]
dup .. all overlapping sites
overlap .. overlapping sites
min(QUAL) .. mark sites with lowest QUAL until overlaps are resolved
--missing EXPR Value to use for missing tags with -m 'min(QUAL)'
0 .. the default
DP .. heuristics, scale maximum QUAL value proportionally to INFO/DP
--reverse Apply the reverse logic, for example preserve duplicates instead of removing
-O, --output-type t t: plain list of sites (chr,pos), tz: compressed list

* bcftools +tag2tag

- The conversions --LXX-to-XX, --XX-to-LXX were working but specific cases such as --LAD-to-AD were not.

- Print a more informative error message when the source tag type violates the VCF specification

* bcftools +trio-dnm2

- Better handling of the --strictly-novel functionality, especially with respect to chrX inheritance


## Release 1.20 (15th April 2024)


Expand Down Expand Up @@ -716,7 +837,7 @@ Changes affecting specific commands:
annotating from a tab-delimited text file, this feature can be invoked
by using `-c INFO/END`.

- add a new '.' modifier to control wheter missing values should be carried
- add a new '.' modifier to control whether missing values should be carried
over from a tab-delimited file or not. For example:

-c TAG .. adds TAG if the source value is not missing. If TAG
Expand Down Expand Up @@ -1068,7 +1189,7 @@ Changes affecting specific commands:

* bcftools csq:

- Fix a bug wich caused incorrect FORMAT/BCSQ formatting at sites with too
- Fix a bug which caused incorrect FORMAT/BCSQ formatting at sites with too
many per-sample consequences

- Fix a bug which incorrectly handled the --ncsq parameter and could clash
Expand Down Expand Up @@ -1785,7 +1906,7 @@ Updates, improvements and bugfixes for many other commands:

* `roh`: Now possible to process multiple samples at once. This allows
considerable speedups for files with thousands of samples where the cost of
HMM is neglibible compared to I/O and decompressing. In order to fit tens of
HMM is negligible compared to I/O and decompressing. In order to fit tens of
thousands samples in memory, a sliding HMM can be used (new `--buffer-size`
option). Viterbi training now uses Baum-Welch algorithm, and works much
better. Support for gVCFs or FORMAT/PL tags. Added `-o, output` and
Expand Down
46 changes: 42 additions & 4 deletions abuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ typedef struct
kstring_t ref, alt;
int ial; // the index of the original ALT allele, 1-based
int beg, end; // 0-based inclusive offsets to ref,alt
int plen; // the ref,alt prefix length, eg plen=1 for C>CA
}
atom_t;

Expand Down Expand Up @@ -175,8 +176,9 @@ static void _atomize_allele(abuf_t *buf, bcf1_t *rec, int ial)
atom->alt.l = 0;
kputc(refb, &atom->ref);
kputc(refb, &atom->alt);
atom->beg = atom->end = i;
atom->ial = ial;
atom->beg = atom->end = i;
atom->ial = ial;
atom->plen = 1;
}
continue;
}
Expand All @@ -202,6 +204,35 @@ static int _atoms_inconsistent(const atom_t *a, const atom_t *b)
if ( rcmp ) return rcmp;
return strcasecmp(a->alt.s,b->alt.s);
}

// Classify how two atoms relate to each other. Return value:
//      0 .. same beg and (case-insensitively) the same ref and alt
//      1 .. the variants themselves do not overlap, although the records may (A>AT vs A>C)
//      2 .. overlapping (conflicting) variants
static int _atoms_overlap(const atom_t *a, const atom_t *b)
{
    // atoms that start at different offsets are always reported as conflicting
    if ( a->beg != b->beg ) return 2;

    // An SNV followed by a DEL does not count as an overlap:
    //      CC > C      plen=1 (ref,alt share a prefix of length 1)
    //      C  > T      plen=0 (ref,alt share no prefix)
    if ( (a->plen && a->plen >= b->ref.l) || (b->plen && b->plen >= a->ref.l) ) return 1;

    // from here on, differing reference alleles mean a conflict
    if ( strcasecmp(a->ref.s,b->ref.s)!=0 ) return 2;

    // An SNV followed by an INS does not count as an overlap either:
    //      A > AT      plen=1 (ref,alt share a prefix of length 1)
    //      A > C       plen=0 (ref,alt share no prefix)
    if ( (a->plen && a->plen >= b->alt.l) || (b->plen && b->plen >= a->alt.l) ) return 1;

    // same beg and ref: the atoms are identical only if alt matches as well
    return strcasecmp(a->alt.s,b->alt.s) ? 2 : 0;
}
/*
For reproducibility of tests on different platforms, we need to guarantee the same order of identical
atoms originating from different source ALTs. Even though they are consistent, different values can be
Expand Down Expand Up @@ -238,7 +269,14 @@ static void _split_table_new(abuf_t *buf, atom_t *atom)
static void _split_table_overlap(abuf_t *buf, int iout, atom_t *atom)
{
uint8_t *ptr = buf->split.tbl + iout*buf->split.nori;
ptr[atom->ial-1] = _atoms_inconsistent(atom,buf->split.atoms[iout]) ? 2 : 1;
int olap = _atoms_overlap(atom,buf->split.atoms[iout]);
ptr[atom->ial-1] = olap > 1 ? 2 : 1;

// The test test/atomize.split.5.vcf shows why we sometimes can and sometimes
// cannot remove the star allele like this
// buf->split.overlaps[iout] = olap > 1 ? 1 : 0;
// I forgot the details of the code, so don't immediately see
// if this could be made smarter
buf->split.overlaps[iout] = 1;
}
#if 0
Expand Down Expand Up @@ -745,7 +783,7 @@ void _abuf_split(abuf_t *buf, bcf1_t *rec)
_split_table_init(buf,rec,buf->natoms);
for (i=0; i<buf->natoms; i++)
{
if ( i && !_atoms_inconsistent(&buf->atoms[i-1],&buf->atoms[i]) ) continue;
if ( i && _atoms_inconsistent(&buf->atoms[i-1],&buf->atoms[i])==0 ) continue;
_split_table_new(buf, &buf->atoms[i]); // add a new unique output atom
}
for (i=0; i<buf->natoms; i++)
Expand Down
Loading

0 comments on commit bcf6c62

Please sign in to comment.